From ed3bd4ef75447e17ff487b16858b7e748042e109 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Thu, 4 Nov 2021 12:20:05 -0600 Subject: [PATCH 01/19] main: add timing ctx manager --- capa/main.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/capa/main.py b/capa/main.py index d610f3838..9bf45ecc8 100644 --- a/capa/main.py +++ b/capa/main.py @@ -10,6 +10,7 @@ """ import os import sys +import time import hashlib import logging import os.path @@ -17,6 +18,7 @@ import datetime import textwrap import itertools +import contextlib import collections from typing import Any, Dict, List, Tuple @@ -65,6 +67,14 @@ logger = logging.getLogger("capa") +@contextlib.contextmanager +def timing(msg: str): + t0 = time.time() + yield + t1 = time.time() + logger.debug("perf: %s: %0.2fs", msg, t1 - t0) + + def set_vivisect_log_level(level): logging.getLogger("vivisect").setLevel(level) logging.getLogger("vivisect.base").setLevel(level) From f98236046bda6c47739214cb0d9d9806129dd128 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Thu, 4 Nov 2021 12:38:35 -0600 Subject: [PATCH 02/19] main: add coarse timing measurements --- capa/main.py | 57 +++++++++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/capa/main.py b/capa/main.py index 9bf45ecc8..4b4248b6c 100644 --- a/capa/main.py +++ b/capa/main.py @@ -168,27 +168,28 @@ def find_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, disable_pro n_funcs = len(functions) pb = pbar(functions, desc="matching", unit=" functions", postfix="skipped 0 library functions") - for f in pb: - function_address = int(f) - - if extractor.is_library_function(function_address): - function_name = extractor.get_function_name(function_address) - logger.debug("skipping library function 0x%x (%s)", function_address, function_name) - meta["library_functions"][function_address] = function_name - n_libs = len(meta["library_functions"]) - percentage = 100 * (n_libs / n_funcs) - if isinstance(pb, tqdm.tqdm): - pb.set_postfix_str("skipped %d library functions (%d%%)" % (n_libs, percentage)) - continue + with timing("match functions"): + for f in pb: + function_address = int(f) + + if extractor.is_library_function(function_address): + function_name = extractor.get_function_name(function_address) + logger.debug("skipping library function 0x%x (%s)", function_address, function_name) + meta["library_functions"][function_address] = function_name + n_libs = len(meta["library_functions"]) + percentage = 100 * (n_libs / n_funcs) + if isinstance(pb, tqdm.tqdm): + pb.set_postfix_str("skipped %d library functions (%d%%)" % (n_libs, percentage)) + continue - function_matches, bb_matches, feature_count = find_function_capabilities(ruleset, extractor, f) - meta["feature_counts"]["functions"][function_address] = feature_count - logger.debug("analyzed function 0x%x and extracted %d features", function_address, feature_count) + function_matches, bb_matches, feature_count = find_function_capabilities(ruleset, extractor, f) + meta["feature_counts"]["functions"][function_address] = feature_count + logger.debug("analyzed function 0x%x and extracted %d features", function_address, feature_count) - for rule_name, res in function_matches.items(): - all_function_matches[rule_name].extend(res) - for rule_name, res in bb_matches.items(): - all_bb_matches[rule_name].extend(res) + for rule_name, res in function_matches.items(): + all_function_matches[rule_name].extend(res) + for rule_name, res in bb_matches.items(): + all_bb_matches[rule_name].extend(res) # collection of features that captures the rule matches within function and BB scopes. # mapping from feature (matched rule) to set of addresses at which it matched. @@ -198,7 +199,8 @@ def find_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, disable_pro rule = ruleset[rule_name] capa.engine.index_rule_matches(function_and_lower_features, rule, locations) - all_file_matches, feature_count = find_file_capabilities(ruleset, extractor, function_and_lower_features) + with timing("match file"): + all_file_matches, feature_count = find_file_capabilities(ruleset, extractor, function_and_lower_features) meta["feature_counts"]["file"] = feature_count matches = { @@ -410,9 +412,11 @@ def get_workspace(path, format, sigpaths): else: raise ValueError("unexpected format: " + format) - viv_utils.flirt.register_flirt_signature_analyzers(vw, sigpaths) + with timing("load FLIRT"): + viv_utils.flirt.register_flirt_signature_analyzers(vw, sigpaths) - vw.analyze() + with timing("viv analyze"): + vw.analyze() logger.debug("%s", get_meta_str(vw)) return vw @@ -900,8 +904,10 @@ def main(argv=None): return E_MISSING_FILE try: - rules = get_rules(args.rules, disable_progress=args.quiet) - rules = capa.rules.RuleSet(rules) + with timing("load rules"): + rules = get_rules(args.rules, disable_progress=args.quiet) + rules = capa.rules.RuleSet(rules) + logger.debug( "successfully loaded %s rules", # during the load of the RuleSet, we extract subscope statements into their own rules @@ -1013,7 +1019,8 @@ def main(argv=None): meta = collect_metadata(argv, args.sample, args.rules, extractor) - capabilities, counts = find_capabilities(rules, extractor, disable_progress=args.quiet) + with timing("find capabilities"): + capabilities, counts = find_capabilities(rules, extractor, disable_progress=args.quiet) meta["analysis"].update(counts) meta["analysis"]["layout"] = compute_layout(rules, extractor, capabilities) From 3d068fe3cdbcb9c155d7d23ac10197fd2598d0b2 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Thu, 4 Nov 2021 13:17:38 -0600 Subject: [PATCH 03/19] scripts: add utilities for collecting profile traces --- .gitignore | 2 ++ scripts/profile-time.sh | 62 ++++++++++++++++++++++++++++++++++ scripts/render-time-profile.sh | 24 +++++++++++++ 3 files changed, 88 insertions(+) create mode 100644 scripts/profile-time.sh create mode 100644 scripts/render-time-profile.sh diff --git a/.gitignore b/.gitignore index 4a731c3cc..29161c055 100644 --- a/.gitignore +++ b/.gitignore @@ -115,3 +115,5 @@ isort-output.log black-output.log rule-linter-output.log .vscode +scripts/perf/*.txt +scripts/perf/*.svg diff --git a/scripts/profile-time.sh b/scripts/profile-time.sh new file mode 100644 index 000000000..d09eabd3f --- /dev/null +++ b/scripts/profile-time.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +# unset variables are errors +set -o nounset; +# any failed commands are errors +set -o errexit; + +# current_directory is the path to the directory containing this script. +# ref: https://stackoverflow.com/a/4774063/87207 +readonly CD="$( cd "$(dirname "$0")" ; pwd -P )" + +panic() { + echo "[erro]: $@" >&2; + exit 1; +} + +info() { + echo "[info]: $@" >&2; +} + +verbose=false; +debug() { + if "$verbose"; then + echo "[debu]: $@" >&2; + fi +} + +if [ "$(git status | grep "modified: " | grep -v "rules" | grep -v "tests/data")" ]; then + panic "modified content"; +fi + +rev=$(git rev-parse --short HEAD); +info "rev: $rev"; + +mkdir -p "$CD/perf/"; + +info "analyzing PMA 01-01.dll..."; + +pma_out=$( + py-spy record \ + -o "$CD/perf/capa-$rev-PMA0101.svg" \ + -- python -m capa.main \ + -d \ + "$CD/../tests/data/Practical Malware Analysis Lab 01-01.dll_" \ + 2>&1 || true); + +echo "$pma_out" | grep "perf:" | sed -e "s/^.*perf: /perf: /g" | tee "$CD/perf/capa-$rev-PMA0101.txt"; + +info "analyzing kernel32.dll..."; +k32_out=$( + py-spy record \ + -o "$CD/perf/capa-$rev-k32.svg" \ + -- python -m capa.main \ + -d \ + "$CD/../tests/data/kernel32.dll_" \ + 2>&1 || true); + +echo "$k32_out" | grep "perf:" | sed -e "s/^.*perf: /perf: /g" | tee "$CD/perf/capa-$rev-k32.txt"; + +bash "$CD/render-time-profile.sh" "$rev"; + +info "done."; diff --git a/scripts/render-time-profile.sh b/scripts/render-time-profile.sh new file mode 100644 index 000000000..5232a2659 --- /dev/null +++ b/scripts/render-time-profile.sh @@ -0,0 +1,24 @@ +fields=("load FLIRT" "viv analyze" "match functions" "match file" "find capabilities"); + +echo -n "| |"; +for T in "${fields[@]}"; do + printf ' %-17s |' "$T"; +done +echo ""; + +echo -n "|---------|"; +for T in "${fields[@]}"; do + echo -n '-------------------|'; +done +echo ""; + +for rev in "$@"; do + + echo -n "| $rev |"; + for T in "${fields[@]}"; do + V1=$(cat scripts/perf/capa-$rev-PMA0101.txt | grep "$T" | sed -e "s/^.*$T: //g"); + V2=$(cat scripts/perf/capa-$rev-k32.txt | grep "$T" | sed -e "s/^.*$T: //g"); + printf ' %-17s |' "$V1/$V2"; + done + echo ""; +done From 86cab26a69b35e8d131db6ebf4784b07bbad50df Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Fri, 5 Nov 2021 14:59:22 -0600 Subject: [PATCH 04/19] add perf counters in module capa.perf --- capa/engine.py | 16 ++++++++++++++++ capa/features/common.py | 12 ++++++++++++ capa/main.py | 4 ++++ capa/perf.py | 3 +++ capa/rules.py | 3 +++ 5 files changed, 38 insertions(+) create mode 100644 capa/perf.py diff --git a/capa/engine.py b/capa/engine.py index 5a295b0db..7a2dea311 100644 --- a/capa/engine.py +++ b/capa/engine.py @@ -10,6 +10,7 @@ import collections from typing import Set, Dict, List, Tuple, Union, Mapping, Iterable +import capa.perf import capa.rules import capa.features.common from capa.features.common import Feature @@ -125,6 +126,9 @@ def __init__(self, children, description=None): self.children = children def evaluate(self, ctx): + capa.perf.counters["evaluate.feature"] += 1 + capa.perf.counters["evaluate.feature.and"] += 1 + results = [child.evaluate(ctx) for child in self.children] success = all(results) return Result(success, self, results) @@ -138,6 +142,9 @@ def __init__(self, children, description=None): self.children = children def evaluate(self, ctx): + capa.perf.counters["evaluate.feature"] += 1 + capa.perf.counters["evaluate.feature.or"] += 1 + results = [child.evaluate(ctx) for child in self.children] success = any(results) return Result(success, self, results) @@ -151,6 +158,9 @@ def __init__(self, child, description=None): self.child = child def evaluate(self, ctx): + capa.perf.counters["evaluate.feature"] += 1 + capa.perf.counters["evaluate.feature.not"] += 1 + results = [self.child.evaluate(ctx)] success = not results[0] return Result(success, self, results) @@ -165,6 +175,9 @@ def __init__(self, count, children, description=None): self.children = children def evaluate(self, ctx): + capa.perf.counters["evaluate.feature"] += 1 + capa.perf.counters["evaluate.feature.some"] += 1 + results = [child.evaluate(ctx) for child in self.children] # note that here we cast the child result as a bool # because we've overridden `__bool__` above. @@ -184,6 +197,9 @@ def __init__(self, child, min=None, max=None, description=None): self.max = max if max is not None else (1 << 64 - 1) def evaluate(self, ctx): + capa.perf.counters["evaluate.feature"] += 1 + capa.perf.counters["evaluate.feature.range"] += 1 + count = len(ctx.get(self.child, [])) if self.min == 0 and count == 0: return Result(True, self, []) diff --git a/capa/features/common.py b/capa/features/common.py index 8a3d0ec23..9fa5d8bf7 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -12,6 +12,7 @@ import collections from typing import Set, Dict, Union +import capa.perf import capa.engine import capa.features import capa.features.extractors.elf @@ -97,6 +98,8 @@ def __repr__(self): return str(self) def evaluate(self, ctx: Dict["Feature", Set[int]]) -> "capa.engine.Result": + capa.perf.counters["evaluate.feature"] += 1 + capa.perf.counters["evaluate.feature." + self.name] += 1 return capa.engine.Result(self in ctx, self, [], locations=ctx.get(self, [])) def freeze_serialize(self): @@ -141,6 +144,9 @@ def __init__(self, value: str, description=None): self.value = value def evaluate(self, ctx): + capa.perf.counters["evaluate.feature"] += 1 + capa.perf.counters["evaluate.feature.substring"] += 1 + # mapping from string value to list of locations. # will unique the locations later on. matches = collections.defaultdict(list) @@ -226,6 +232,9 @@ def __init__(self, value: str, description=None): ) def evaluate(self, ctx): + capa.perf.counters["evaluate.feature"] += 1 + capa.perf.counters["evaluate.feature.regex"] += 1 + # mapping from string value to list of locations. # will unique the locations later on. matches = collections.defaultdict(list) @@ -309,6 +318,9 @@ def __init__(self, value: bytes, description=None): self.value = value def evaluate(self, ctx): + capa.perf.counters["evaluate.feature"] += 1 + capa.perf.counters["evaluate.feature.bytes"] += 1 + for feature, locations in ctx.items(): if not isinstance(feature, (Bytes,)): continue diff --git a/capa/main.py b/capa/main.py index 4b4248b6c..a830b2acd 100644 --- a/capa/main.py +++ b/capa/main.py @@ -28,6 +28,7 @@ from pefile import PEFormatError from elftools.common.exceptions import ELFError +import capa.perf import capa.rules import capa.engine import capa.version @@ -1024,6 +1025,9 @@ def main(argv=None): meta["analysis"].update(counts) meta["analysis"]["layout"] = compute_layout(rules, extractor, capabilities) + for (counter, count) in capa.perf.counters.most_common(): + logger.debug("perf: counter: %s: %d", counter, count) + if has_file_limitation(rules, capabilities): # bail if capa encountered file limitation e.g. a packed binary # do show the output in verbose mode, though. diff --git a/capa/perf.py b/capa/perf.py new file mode 100644 index 000000000..d1e4083d4 --- /dev/null +++ b/capa/perf.py @@ -0,0 +1,3 @@ +import collections + +counters = collections.Counter() diff --git a/capa/rules.py b/capa/rules.py index a8e65b22b..6960e02bf 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -27,6 +27,7 @@ import yaml import ruamel.yaml +import capa.perf import capa.engine as ceng import capa.features import capa.features.file @@ -620,6 +621,8 @@ def extract_subscope_rules(self): yield new_rule def evaluate(self, features: FeatureSet): + capa.perf.counters["evaluate.feature"] += 1 + capa.perf.counters["evaluate.feature.rule"] += 1 return self.statement.evaluate(features) @classmethod From 6524449ad1500a2496b35542d8ca68c43a52b20a Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Fri, 5 Nov 2021 15:23:22 -0600 Subject: [PATCH 05/19] main: perf: human format the numbers --- capa/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/main.py b/capa/main.py index a830b2acd..5aaa97d65 100644 --- a/capa/main.py +++ b/capa/main.py @@ -1026,7 +1026,7 @@ def main(argv=None): meta["analysis"]["layout"] = compute_layout(rules, extractor, capabilities) for (counter, count) in capa.perf.counters.most_common(): - logger.debug("perf: counter: %s: %d", counter, count) + logger.debug("perf: counter: {:}: {:,}".format(counter, count)) if has_file_limitation(rules, capabilities): # bail if capa encountered file limitation e.g. a packed binary From 3a12472be8ff39b33048a34a23f29522496cd9b3 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Fri, 5 Nov 2021 15:23:34 -0600 Subject: [PATCH 06/19] perf: render: show evaluate.feature counter --- scripts/render-time-profile.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/render-time-profile.sh b/scripts/render-time-profile.sh index 5232a2659..1b5627dce 100644 --- a/scripts/render-time-profile.sh +++ b/scripts/render-time-profile.sh @@ -1,4 +1,4 @@ -fields=("load FLIRT" "viv analyze" "match functions" "match file" "find capabilities"); +fields=("load FLIRT" "viv analyze" "match functions" "match file" "find capabilities" "evaluate.feature"); echo -n "| |"; for T in "${fields[@]}"; do @@ -16,8 +16,8 @@ for rev in "$@"; do echo -n "| $rev |"; for T in "${fields[@]}"; do - V1=$(cat scripts/perf/capa-$rev-PMA0101.txt | grep "$T" | sed -e "s/^.*$T: //g"); - V2=$(cat scripts/perf/capa-$rev-k32.txt | grep "$T" | sed -e "s/^.*$T: //g"); + V1=$(cat scripts/perf/capa-$rev-PMA0101.txt | grep "$T: " | sed -e "s/^.*$T: //g"); + V2=$(cat scripts/perf/capa-$rev-k32.txt | grep "$T: " | sed -e "s/^.*$T: //g"); printf ' %-17s |' "$V1/$V2"; done echo ""; From 702d00da9137aba8e48f3c091a38117587b84fdc Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Fri, 5 Nov 2021 15:24:24 -0600 Subject: [PATCH 07/19] gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 29161c055..cc331b545 100644 --- a/.gitignore +++ b/.gitignore @@ -117,3 +117,4 @@ rule-linter-output.log .vscode scripts/perf/*.txt scripts/perf/*.svg +scripts/perf/*.zip From 0629c584e123ca35499fef0c5d80b350bd7d5815 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 8 Nov 2021 11:50:16 -0700 Subject: [PATCH 08/19] common: move Result to capa.common from capa.engine fixes circular import error in capa.features.freeze --- capa/engine.py | 50 ++----------------------------- capa/features/common.py | 65 ++++++++++++++++++++++++++++++++++------- 2 files changed, 57 insertions(+), 58 deletions(-) diff --git a/capa/engine.py b/capa/engine.py index 7a2dea311..d6138c9c9 100644 --- a/capa/engine.py +++ b/capa/engine.py @@ -13,7 +13,7 @@ import capa.perf import capa.rules import capa.features.common -from capa.features.common import Feature +from capa.features.common import Result, Feature # a collection of features and the locations at which they are found. # @@ -46,15 +46,9 @@ def __str__(self): def __repr__(self): return str(self) - def evaluate(self, features: FeatureSet) -> "Result": + def evaluate(self, features: FeatureSet) -> Result: """ classes that inherit `Statement` must implement `evaluate` - - args: - ctx (defaultdict[Feature, set[VA]]) - - returns: - Result """ raise NotImplementedError() @@ -78,46 +72,6 @@ def replace_child(self, existing, new): children[i] = new -class Result: - """ - represents the results of an evaluation of statements against features. - - instances of this class should behave like a bool, - e.g. `assert Result(True, ...) == True` - - instances track additional metadata about evaluation results. - they contain references to the statement node (e.g. an And statement), - as well as the children Result instances. - - we need this so that we can render the tree of expressions and their results. - """ - - def __init__(self, success: bool, statement: Union[Statement, Feature], children: List["Result"], locations=None): - """ - args: - success (bool) - statement (capa.engine.Statement or capa.features.Feature) - children (list[Result]) - locations (iterable[VA]) - """ - super(Result, self).__init__() - self.success = success - self.statement = statement - self.children = children - self.locations = locations if locations is not None else () - - def __eq__(self, other): - if isinstance(other, bool): - return self.success == other - return False - - def __bool__(self): - return self.success - - def __nonzero__(self): - return self.success - - class And(Statement): """match if all of the children evaluate to True.""" diff --git a/capa/features/common.py b/capa/features/common.py index 9fa5d8bf7..0f01ef523 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -10,10 +10,9 @@ import codecs import logging import collections -from typing import Set, Dict, Union +from typing import Set, Dict, List, Union import capa.perf -import capa.engine import capa.features import capa.features.extractors.elf @@ -47,6 +46,52 @@ def escape_string(s: str) -> str: return s +class Result: + """ + represents the results of an evaluation of statements against features. + + instances of this class should behave like a bool, + e.g. `assert Result(True, ...) == True` + + instances track additional metadata about evaluation results. + they contain references to the statement node (e.g. an And statement), + as well as the children Result instances. + + we need this so that we can render the tree of expressions and their results. + """ + + def __init__( + self, + success: bool, + statement: Union["capa.engine.Statement", "Feature"], + children: List["Result"], + locations=None, + ): + """ + args: + success (bool) + statement (capa.engine.Statement or capa.features.Feature) + children (list[Result]) + locations (iterable[VA]) + """ + super(Result, self).__init__() + self.success = success + self.statement = statement + self.children = children + self.locations = locations if locations is not None else () + + def __eq__(self, other): + if isinstance(other, bool): + return self.success == other + return False + + def __bool__(self): + return self.success + + def __nonzero__(self): + return self.success + + class Feature: def __init__(self, value: Union[str, int, bytes], bitness=None, description=None): """ @@ -97,10 +142,10 @@ def __str__(self): def __repr__(self): return str(self) - def evaluate(self, ctx: Dict["Feature", Set[int]]) -> "capa.engine.Result": + def evaluate(self, ctx: Dict["Feature", Set[int]]) -> "Result": capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature." + self.name] += 1 - return capa.engine.Result(self in ctx, self, [], locations=ctx.get(self, [])) + return Result(self in ctx, self, [], locations=ctx.get(self, [])) def freeze_serialize(self): if self.bitness is not None: @@ -176,9 +221,9 @@ def evaluate(self, ctx): # unlike other features, we cannot return put a reference to `self` directly in a `Result`. # this is because `self` may match on many strings, so we can't stuff the matched value into it. # instead, return a new instance that has a reference to both the substring and the matched values. - return capa.engine.Result(True, _MatchedSubstring(self, matches), [], locations=locations) + return Result(True, _MatchedSubstring(self, matches), [], locations=locations) else: - return capa.engine.Result(False, _MatchedSubstring(self, None), []) + return Result(False, _MatchedSubstring(self, None), []) def __str__(self): return "substring(%s)" % self.value @@ -269,9 +314,9 @@ def evaluate(self, ctx): # this is because `self` may match on many strings, so we can't stuff the matched value into it. # instead, return a new instance that has a reference to both the regex and the matched values. # see #262. - return capa.engine.Result(True, _MatchedRegex(self, matches), [], locations=locations) + return Result(True, _MatchedRegex(self, matches), [], locations=locations) else: - return capa.engine.Result(False, _MatchedRegex(self, None), []) + return Result(False, _MatchedRegex(self, None), []) def __str__(self): return "regex(string =~ %s)" % self.value @@ -326,9 +371,9 @@ def evaluate(self, ctx): continue if feature.value.startswith(self.value): - return capa.engine.Result(True, self, [], locations=locations) + return Result(True, self, [], locations=locations) - return capa.engine.Result(False, self, []) + return Result(False, self, []) def get_value_str(self): return hex_string(bytes_to_str(self.value)) From 5770d0c12d19dd40a8e95cc2c15032f8977d6449 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 8 Nov 2021 11:50:49 -0700 Subject: [PATCH 09/19] perf: add reset routine --- capa/perf.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/capa/perf.py b/capa/perf.py index d1e4083d4..c7b416c0b 100644 --- a/capa/perf.py +++ b/capa/perf.py @@ -1,3 +1,8 @@ import collections counters = collections.Counter() + + +def reset(): + global counters + counters = collections.Counter() From a35be4a666c26ec13d61ee30639d9c3894d129e1 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 8 Nov 2021 11:51:09 -0700 Subject: [PATCH 10/19] scripts: add py script for profiling time --- scripts/profile-time.py | 115 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 scripts/profile-time.py diff --git a/scripts/profile-time.py b/scripts/profile-time.py new file mode 100644 index 000000000..3c47b67b4 --- /dev/null +++ b/scripts/profile-time.py @@ -0,0 +1,115 @@ +import sys +import timeit +import logging +import argparse +import subprocess + +import tqdm +import tabulate + +import capa.main +import capa.perf +import capa.rules +import capa.engine +import capa.helpers +import capa.features +import capa.features.common +import capa.features.freeze + +logger = logging.getLogger("capa.profile") + + +def main(argv=None): + if argv is None: + argv = sys.argv[1:] + + label = subprocess.run( + "git show --pretty=oneline --abbrev-commit | head -n 1", shell=True, capture_output=True, text=True + ).stdout.strip() + is_dirty = ( + subprocess.run( + "git status | grep 'modified: ' | grep -v 'rules' | grep -v 'tests/data'", + shell=True, + capture_output=True, + text=True, + ).stdout + != "" + ) + + if is_dirty: + label += " (dirty)" + + parser = argparse.ArgumentParser(description="Profile capa performance") + capa.main.install_common_args(parser, wanted={"format", "sample", "signatures", "rules"}) + + parser.add_argument("--number", type=int, default=3, help="batch size of profile collection") + parser.add_argument("--repeat", type=int, default=30, help="batch count of profile collection") + parser.add_argument("--label", type=str, default=label, help="description of the profile collection") + + args = parser.parse_args(args=argv) + capa.main.handle_common_args(args) + + try: + taste = capa.helpers.get_file_taste(args.sample) + except IOError as e: + logger.error("%s", str(e)) + return -1 + + try: + with capa.main.timing("load rules"): + rules = capa.rules.RuleSet(capa.main.get_rules(args.rules, disable_progress=True)) + except (IOError) as e: + logger.error("%s", str(e)) + return -1 + + try: + sig_paths = capa.main.get_signatures(args.signatures) + except (IOError) as e: + logger.error("%s", str(e)) + return -1 + + if (args.format == "freeze") or (args.format == "auto" and capa.features.freeze.is_freeze(taste)): + with open(args.sample, "rb") as f: + extractor = capa.features.freeze.load(f.read()) + else: + extractor = capa.main.get_extractor( + args.sample, args.format, capa.main.BACKEND_VIV, sig_paths, should_save_workspace=False + ) + + with tqdm.tqdm(total=args.number * args.repeat) as pbar: + + def do_iteration(): + capa.perf.reset() + capa.main.find_capabilities(rules, extractor, disable_progress=True) + pbar.update(1) + + samples = timeit.repeat(do_iteration, number=args.number, repeat=args.repeat) + + logger.debug("perf: find capabilities: min: %0.2fs" % (min(samples) / float(args.number))) + logger.debug("perf: find capabilities: avg: %0.2fs" % (sum(samples) / float(args.repeat) / float(args.number))) + logger.debug("perf: find capabilities: max: %0.2fs" % (max(samples) / float(args.number))) + + for (counter, count) in capa.perf.counters.most_common(): + logger.debug("perf: counter: {:}: {:,}".format(counter, count)) + + print( + tabulate.tabulate( + [ + ( + args.label, + "{:,}".format(capa.perf.counters["evaluate.feature"]), + "%0.2fs" % (sum(samples) / float(args.repeat) / float(args.number)), + "%0.2fs" % (min(samples) / float(args.number)), + "%0.2fs" % (max(samples) / float(args.number)), + ) + ], + headers=["label", "count(evaluations)", "avg(time)", "min(time)", "max(time)"], + tablefmt="github", + ) + ) + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From 70f007525d82df0b3980e954b2917aba9c781a4a Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 8 Nov 2021 12:11:01 -0700 Subject: [PATCH 11/19] pep8 --- capa/engine.py | 18 +++++++++--------- capa/features/common.py | 10 +++++----- capa/rules.py | 2 +- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/capa/engine.py b/capa/engine.py index d6138c9c9..97f6d2958 100644 --- a/capa/engine.py +++ b/capa/engine.py @@ -81,7 +81,7 @@ def __init__(self, children, description=None): def evaluate(self, ctx): capa.perf.counters["evaluate.feature"] += 1 - capa.perf.counters["evaluate.feature.and"] += 1 + capa.perf.counters["evaluate.feature.and"] += 1 results = [child.evaluate(ctx) for child in self.children] success = all(results) @@ -97,8 +97,8 @@ def __init__(self, children, description=None): def evaluate(self, ctx): capa.perf.counters["evaluate.feature"] += 1 - capa.perf.counters["evaluate.feature.or"] += 1 - + capa.perf.counters["evaluate.feature.or"] += 1 + results = [child.evaluate(ctx) for child in self.children] success = any(results) return Result(success, self, results) @@ -113,8 +113,8 @@ def __init__(self, child, description=None): def evaluate(self, ctx): capa.perf.counters["evaluate.feature"] += 1 - capa.perf.counters["evaluate.feature.not"] += 1 - + capa.perf.counters["evaluate.feature.not"] += 1 + results = [self.child.evaluate(ctx)] success = not results[0] return Result(success, self, results) @@ -130,8 +130,8 @@ def __init__(self, count, children, description=None): def evaluate(self, ctx): capa.perf.counters["evaluate.feature"] += 1 - capa.perf.counters["evaluate.feature.some"] += 1 - + capa.perf.counters["evaluate.feature.some"] += 1 + results = [child.evaluate(ctx) for child in self.children] # note that here we cast the child result as a bool # because we've overridden `__bool__` above. @@ -152,8 +152,8 @@ def __init__(self, child, min=None, max=None, description=None): def evaluate(self, ctx): capa.perf.counters["evaluate.feature"] += 1 - capa.perf.counters["evaluate.feature.range"] += 1 - + capa.perf.counters["evaluate.feature.range"] += 1 + count = len(ctx.get(self.child, [])) if self.min == 0 and count == 0: return Result(True, self, []) diff --git a/capa/features/common.py b/capa/features/common.py index 0f01ef523..a40201e3a 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -190,7 +190,7 @@ def __init__(self, value: str, description=None): def evaluate(self, ctx): capa.perf.counters["evaluate.feature"] += 1 - capa.perf.counters["evaluate.feature.substring"] += 1 + capa.perf.counters["evaluate.feature.substring"] += 1 # mapping from string value to list of locations. # will unique the locations later on. @@ -278,8 +278,8 @@ def __init__(self, value: str, description=None): def evaluate(self, ctx): capa.perf.counters["evaluate.feature"] += 1 - capa.perf.counters["evaluate.feature.regex"] += 1 - + capa.perf.counters["evaluate.feature.regex"] += 1 + # mapping from string value to list of locations. # will unique the locations later on. matches = collections.defaultdict(list) @@ -364,8 +364,8 @@ def __init__(self, value: bytes, description=None): def evaluate(self, ctx): capa.perf.counters["evaluate.feature"] += 1 - capa.perf.counters["evaluate.feature.bytes"] += 1 - + capa.perf.counters["evaluate.feature.bytes"] += 1 + for feature, locations in ctx.items(): if not isinstance(feature, (Bytes,)): continue diff --git a/capa/rules.py b/capa/rules.py index 6960e02bf..2753f19dc 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -622,7 +622,7 @@ def extract_subscope_rules(self): def evaluate(self, features: FeatureSet): capa.perf.counters["evaluate.feature"] += 1 - capa.perf.counters["evaluate.feature.rule"] += 1 + capa.perf.counters["evaluate.feature.rule"] += 1 return self.statement.evaluate(features) @classmethod From b621205a06a70822e1081d5341c59f09dd2f6133 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 8 Nov 2021 14:24:13 -0700 Subject: [PATCH 12/19] mypy --- capa/features/common.py | 6 +++++- capa/perf.py | 3 ++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/capa/features/common.py b/capa/features/common.py index a40201e3a..ac589593b 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -10,7 +10,11 @@ import codecs import logging import collections -from typing import Set, Dict, List, Union +from typing import TYPE_CHECKING, Set, Dict, List, Union + +if TYPE_CHECKING: + # circular import, otherwise + import capa.engine import capa.perf import capa.features diff --git a/capa/perf.py b/capa/perf.py index c7b416c0b..067c4f37a 100644 --- a/capa/perf.py +++ b/capa/perf.py @@ -1,6 +1,7 @@ import collections +from typing import Dict -counters = collections.Counter() +counters: Dict[str, int] = collections.Counter() def reset(): From f598acb8fc78e80063bb4ccff1910c6cc5ef1503 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 8 Nov 2021 14:24:48 -0700 Subject: [PATCH 13/19] scripts: remove old profiling scripts --- scripts/profile-time.sh | 62 ---------------------------------- scripts/render-time-profile.sh | 24 ------------- 2 files changed, 86 deletions(-) delete mode 100644 scripts/profile-time.sh delete mode 100644 scripts/render-time-profile.sh diff --git a/scripts/profile-time.sh b/scripts/profile-time.sh deleted file mode 100644 index d09eabd3f..000000000 --- a/scripts/profile-time.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/bin/bash - -# unset variables are errors -set -o nounset; -# any failed commands are errors -set -o errexit; - -# current_directory is the path to the directory containing this script. -# ref: https://stackoverflow.com/a/4774063/87207 -readonly CD="$( cd "$(dirname "$0")" ; pwd -P )" - -panic() { - echo "[erro]: $@" >&2; - exit 1; -} - -info() { - echo "[info]: $@" >&2; -} - -verbose=false; -debug() { - if "$verbose"; then - echo "[debu]: $@" >&2; - fi -} - -if [ "$(git status | grep "modified: " | grep -v "rules" | grep -v "tests/data")" ]; then - panic "modified content"; -fi - -rev=$(git rev-parse --short HEAD); -info "rev: $rev"; - -mkdir -p "$CD/perf/"; - -info "analyzing PMA 01-01.dll..."; - -pma_out=$( - py-spy record \ - -o "$CD/perf/capa-$rev-PMA0101.svg" \ - -- python -m capa.main \ - -d \ - "$CD/../tests/data/Practical Malware Analysis Lab 01-01.dll_" \ - 2>&1 || true); - -echo "$pma_out" | grep "perf:" | sed -e "s/^.*perf: /perf: /g" | tee "$CD/perf/capa-$rev-PMA0101.txt"; - -info "analyzing kernel32.dll..."; -k32_out=$( - py-spy record \ - -o "$CD/perf/capa-$rev-k32.svg" \ - -- python -m capa.main \ - -d \ - "$CD/../tests/data/kernel32.dll_" \ - 2>&1 || true); - -echo "$k32_out" | grep "perf:" | sed -e "s/^.*perf: /perf: /g" | tee "$CD/perf/capa-$rev-k32.txt"; - -bash "$CD/render-time-profile.sh" "$rev"; - -info "done."; diff --git a/scripts/render-time-profile.sh b/scripts/render-time-profile.sh deleted file mode 100644 index 1b5627dce..000000000 --- a/scripts/render-time-profile.sh +++ /dev/null @@ -1,24 +0,0 @@ -fields=("load FLIRT" "viv analyze" "match functions" "match file" "find capabilities" "evaluate.feature"); - -echo -n "| |"; -for T in "${fields[@]}"; do - printf ' %-17s |' "$T"; -done -echo ""; - -echo -n "|---------|"; -for T in "${fields[@]}"; do - echo -n '-------------------|'; -done -echo ""; - -for rev in "$@"; do - - echo -n "| $rev |"; - for T in "${fields[@]}"; do - V1=$(cat scripts/perf/capa-$rev-PMA0101.txt | grep "$T: " | sed -e "s/^.*$T: //g"); - V2=$(cat scripts/perf/capa-$rev-k32.txt | grep "$T: " | sed -e "s/^.*$T: //g"); - printf ' %-17s |' "$V1/$V2"; - done - echo ""; -done From 6c8d246af91e1f0e0daa22b5f511ea0acbd40cf0 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 8 Nov 2021 14:31:43 -0700 Subject: [PATCH 14/19] fix bad merge --- capa/features/common.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/capa/features/common.py b/capa/features/common.py index 75ed12ba5..3a4e71e94 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -146,13 +146,9 @@ def __str__(self): def __repr__(self): return str(self) -<<<<<<< HEAD - def evaluate(self, ctx: Dict["Feature", Set[int]]) -> "Result": + def evaluate(self, ctx: Dict["Feature", Set[int]]) -> Result: capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature." + self.name] += 1 -======= - def evaluate(self, ctx: Dict["Feature", Set[int]]) -> Result: ->>>>>>> master return Result(self in ctx, self, [], locations=ctx.get(self, [])) def freeze_serialize(self): From 334425a08f024be847a8ac827e10c2187e8b804b Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 8 Nov 2021 15:16:08 -0700 Subject: [PATCH 15/19] changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index cc4dac015..5c7c3ac6b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,8 @@ ### Development +- add profiling infrastructure #828 @williballenthin + ### Raw diffs - [capa ...master](https://github.com/mandiant/capa/compare/v3.0.3...master) - [capa-rules ...master](https://github.com/mandiant/capa-rules/compare/v3.0.3...master) From 6f6831f8124771aec22597a42fa2929213971016 Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 8 Nov 2021 15:20:11 -0700 Subject: [PATCH 16/19] perf: document that counters is unstable --- capa/perf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/capa/perf.py b/capa/perf.py index 067c4f37a..cb0e89eca 100644 --- a/capa/perf.py +++ b/capa/perf.py @@ -1,6 +1,7 @@ import collections from typing import Dict +# this structure is unstable and may change before the next major release. counters: Dict[str, int] = collections.Counter() From 2abebfbce72842cc5ab8fe1b6dbc196d386aecda Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 8 Nov 2021 15:22:01 -0700 Subject: [PATCH 17/19] main: remove perf messages --- capa/main.py | 56 +++++++++++++++++++++++----------------------------- 1 file changed, 25 insertions(+), 31 deletions(-) diff --git a/capa/main.py b/capa/main.py index 5aaa97d65..52c28d272 100644 --- a/capa/main.py +++ b/capa/main.py @@ -169,28 +169,27 @@ def find_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, disable_pro n_funcs = len(functions) pb = pbar(functions, desc="matching", unit=" functions", postfix="skipped 0 library functions") - with timing("match functions"): - for f in pb: - function_address = int(f) - - if extractor.is_library_function(function_address): - function_name = extractor.get_function_name(function_address) - logger.debug("skipping library function 0x%x (%s)", function_address, function_name) - meta["library_functions"][function_address] = function_name - n_libs = len(meta["library_functions"]) - percentage = 100 * (n_libs / n_funcs) - if isinstance(pb, tqdm.tqdm): - pb.set_postfix_str("skipped %d library functions (%d%%)" % (n_libs, percentage)) - continue + for f in pb: + function_address = int(f) + + if extractor.is_library_function(function_address): + function_name = extractor.get_function_name(function_address) + logger.debug("skipping library function 0x%x (%s)", function_address, function_name) + meta["library_functions"][function_address] = function_name + n_libs = len(meta["library_functions"]) + percentage = 100 * (n_libs / n_funcs) + if isinstance(pb, tqdm.tqdm): + pb.set_postfix_str("skipped %d library functions (%d%%)" % (n_libs, percentage)) + continue - function_matches, bb_matches, feature_count = find_function_capabilities(ruleset, extractor, f) - meta["feature_counts"]["functions"][function_address] = feature_count - logger.debug("analyzed function 0x%x and extracted %d features", function_address, feature_count) + function_matches, bb_matches, feature_count = find_function_capabilities(ruleset, extractor, f) + meta["feature_counts"]["functions"][function_address] = feature_count + logger.debug("analyzed function 0x%x and extracted %d features", function_address, feature_count) - for rule_name, res in function_matches.items(): - all_function_matches[rule_name].extend(res) - for rule_name, res in bb_matches.items(): - all_bb_matches[rule_name].extend(res) + for rule_name, res in function_matches.items(): + all_function_matches[rule_name].extend(res) + for rule_name, res in bb_matches.items(): + all_bb_matches[rule_name].extend(res) # collection of features that captures the rule matches within function and BB scopes. # mapping from feature (matched rule) to set of addresses at which it matched. @@ -200,8 +199,7 @@ def find_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, disable_pro rule = ruleset[rule_name] capa.engine.index_rule_matches(function_and_lower_features, rule, locations) - with timing("match file"): - all_file_matches, feature_count = find_file_capabilities(ruleset, extractor, function_and_lower_features) + all_file_matches, feature_count = find_file_capabilities(ruleset, extractor, function_and_lower_features) meta["feature_counts"]["file"] = feature_count matches = { @@ -413,11 +411,9 @@ def get_workspace(path, format, sigpaths): else: raise ValueError("unexpected format: " + format) - with timing("load FLIRT"): - viv_utils.flirt.register_flirt_signature_analyzers(vw, sigpaths) + viv_utils.flirt.register_flirt_signature_analyzers(vw, sigpaths) - with timing("viv analyze"): - vw.analyze() + vw.analyze() logger.debug("%s", get_meta_str(vw)) return vw @@ -905,9 +901,8 @@ def main(argv=None): return E_MISSING_FILE try: - with timing("load rules"): - rules = get_rules(args.rules, disable_progress=args.quiet) - rules = capa.rules.RuleSet(rules) + rules = get_rules(args.rules, disable_progress=args.quiet) + rules = capa.rules.RuleSet(rules) logger.debug( "successfully loaded %s rules", @@ -1020,8 +1015,7 @@ def main(argv=None): meta = collect_metadata(argv, args.sample, args.rules, extractor) - with timing("find capabilities"): - capabilities, counts = find_capabilities(rules, extractor, disable_progress=args.quiet) + capabilities, counts = find_capabilities(rules, extractor, disable_progress=args.quiet) meta["analysis"].update(counts) meta["analysis"]["layout"] = compute_layout(rules, extractor, capabilities) From 18c30e4f12fbb3d639d59bff97bd3f88d76a236b Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Mon, 8 Nov 2021 15:24:43 -0700 Subject: [PATCH 18/19] main: remove perf debug msgs --- capa/main.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/capa/main.py b/capa/main.py index 52c28d272..6fe59a510 100644 --- a/capa/main.py +++ b/capa/main.py @@ -1019,9 +1019,6 @@ def main(argv=None): meta["analysis"].update(counts) meta["analysis"]["layout"] = compute_layout(rules, extractor, capabilities) - for (counter, count) in capa.perf.counters.most_common(): - logger.debug("perf: counter: {:}: {:,}".format(counter, count)) - if has_file_limitation(rules, capabilities): # bail if capa encountered file limitation e.g. a packed binary # do show the output in verbose mode, though. From 7a4aee592be171c7ba7fde03623a1596db831b9e Mon Sep 17 00:00:00 2001 From: William Ballenthin Date: Tue, 9 Nov 2021 16:08:39 -0700 Subject: [PATCH 19/19] profile-time: add doc --- scripts/profile-time.py | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/scripts/profile-time.py b/scripts/profile-time.py index 3c47b67b4..3d6b5e070 100644 --- a/scripts/profile-time.py +++ b/scripts/profile-time.py @@ -1,3 +1,34 @@ +""" +Invoke capa multiple times and record profiling informations. +Use the --number and --repeat options to change the number of iterations. +By default, the script will emit a markdown table with a label pulled from git. + +Note: you can run this script against pre-generated .frz files to reduce the startup time. + +usage: + + usage: profile-time.py [--number NUMBER] [--repeat REPEAT] [--label LABEL] sample + + Profile capa performance + + positional arguments: + sample path to sample to analyze + + optional arguments: + --number NUMBER batch size of profile collection + --repeat REPEAT batch count of profile collection + --label LABEL description of the profile collection + +example: + + $ python profile-time.py ./tests/data/kernel32.dll_.frz --number 1 --repeat 2 + + | label | count(evaluations) | avg(time) | min(time) | max(time) | + |--------------------------------------|----------------------|-------------|-------------|-------------| + | 18c30e4 main: remove perf debug msgs | 66,561,622 | 132.13s | 125.14s | 139.12s | + + ^^^ --label or git hash +""" import sys import timeit import logging @@ -98,12 +129,16 @@ def do_iteration(): ( args.label, "{:,}".format(capa.perf.counters["evaluate.feature"]), - "%0.2fs" % (sum(samples) / float(args.repeat) / float(args.number)), + # python documentation indicates that min(samples) should be preferred, + # so lets put that first. + # + # https://docs.python.org/3/library/timeit.html#timeit.Timer.repeat "%0.2fs" % (min(samples) / float(args.number)), + "%0.2fs" % (sum(samples) / float(args.repeat) / float(args.number)), "%0.2fs" % (max(samples) / float(args.number)), ) ], - headers=["label", "count(evaluations)", "avg(time)", "min(time)", "max(time)"], + headers=["label", "count(evaluations)", "min(time)", "avg(time)", "max(time)"], tablefmt="github", ) )