mandiant · williballenthin · Nov 9, 2021 · Nov 4, 2021 · Nov 4, 2021 · Nov 4, 2021
diff --git a/.gitignore b/.gitignore
@@ -115,3 +115,6 @@ isort-output.log
 black-output.log
 rule-linter-output.log
 .vscode
+scripts/perf/*.txt
+scripts/perf/*.svg
+scripts/perf/*.zip
diff --git a/capa/engine.py b/capa/engine.py
@@ -10,6 +10,7 @@
 import collections
 from typing import Set, Dict, List, Tuple, Union, Mapping, Iterable
 
+import capa.perf
 import capa.rules
 import capa.features.common
 from capa.features.common import Result, Feature
@@ -79,6 +80,9 @@ def __init__(self, children, description=None):
         self.children = children
 
     def evaluate(self, ctx):
+        capa.perf.counters["evaluate.feature"] += 1
+        capa.perf.counters["evaluate.feature.and"] += 1
+
         results = [child.evaluate(ctx) for child in self.children]
         success = all(results)
         return Result(success, self, results)
@@ -92,6 +96,9 @@ def __init__(self, children, description=None):
         self.children = children
 
     def evaluate(self, ctx):
+        capa.perf.counters["evaluate.feature"] += 1
+        capa.perf.counters["evaluate.feature.or"] += 1
+
         results = [child.evaluate(ctx) for child in self.children]
         success = any(results)
         return Result(success, self, results)
@@ -105,6 +112,9 @@ def __init__(self, child, description=None):
         self.child = child
 
     def evaluate(self, ctx):
+        capa.perf.counters["evaluate.feature"] += 1
+        capa.perf.counters["evaluate.feature.not"] += 1
+
         results = [self.child.evaluate(ctx)]
         success = not results[0]
         return Result(success, self, results)
@@ -119,6 +129,9 @@ def __init__(self, count, children, description=None):
         self.children = children
 
     def evaluate(self, ctx):
+        capa.perf.counters["evaluate.feature"] += 1
+        capa.perf.counters["evaluate.feature.some"] += 1
+
         results = [child.evaluate(ctx) for child in self.children]
         # note that here we cast the child result as a bool
         # because we've overridden `__bool__` above.
@@ -138,6 +151,9 @@ def __init__(self, child, min=None, max=None, description=None):
         self.max = max if max is not None else (1 << 64 - 1)
 
     def evaluate(self, ctx):
+        capa.perf.counters["evaluate.feature"] += 1
+        capa.perf.counters["evaluate.feature.range"] += 1
+
         count = len(ctx.get(self.child, []))
         if self.min == 0 and count == 0:
             return Result(True, self, [])

diff --git a/capa/features/common.py b/capa/features/common.py
@@ -16,6 +16,7 @@
     # circular import, otherwise
     import capa.engine
 
+import capa.perf
 import capa.features
 import capa.features.extractors.elf
 
@@ -146,6 +147,8 @@ def __repr__(self):
         return str(self)
 
     def evaluate(self, ctx: Dict["Feature", Set[int]]) -> Result:
+        capa.perf.counters["evaluate.feature"] += 1
+        capa.perf.counters["evaluate.feature." + self.name] += 1
         return Result(self in ctx, self, [], locations=ctx.get(self, []))
 
     def freeze_serialize(self):
@@ -190,6 +193,9 @@ def __init__(self, value: str, description=None):
         self.value = value
 
     def evaluate(self, ctx):
+        capa.perf.counters["evaluate.feature"] += 1
+        capa.perf.counters["evaluate.feature.substring"] += 1
+
         # mapping from string value to list of locations.
         # will unique the locations later on.
         matches = collections.defaultdict(list)
@@ -275,6 +281,9 @@ def __init__(self, value: str, description=None):
             )
 
     def evaluate(self, ctx):
+        capa.perf.counters["evaluate.feature"] += 1
+        capa.perf.counters["evaluate.feature.regex"] += 1
+
         # mapping from string value to list of locations.
         # will unique the locations later on.
         matches = collections.defaultdict(list)
@@ -358,6 +367,9 @@ def __init__(self, value: bytes, description=None):
         self.value = value
 
     def evaluate(self, ctx):
+        capa.perf.counters["evaluate.feature"] += 1
+        capa.perf.counters["evaluate.feature.bytes"] += 1
+
         for feature, locations in ctx.items():
             if not isinstance(feature, (Bytes,)):
                 continue

diff --git a/capa/main.py b/capa/main.py
@@ -10,13 +10,15 @@
 """
 import os
 import sys
+import time
 import hashlib
 import logging
 import os.path
 import argparse
 import datetime
 import textwrap
 import itertools
+import contextlib
 import collections
 from typing import Any, Dict, List, Tuple
 
@@ -26,6 +28,7 @@
 from pefile import PEFormatError
 from elftools.common.exceptions import ELFError
 
+import capa.perf
 import capa.rules
 import capa.engine
 import capa.version
@@ -65,6 +68,14 @@
 logger = logging.getLogger("capa")
 
 
+@contextlib.contextmanager
+def timing(msg: str):
+    t0 = time.time()
+    yield
+    t1 = time.time()
+    logger.debug("perf: %s: %0.2fs", msg, t1 - t0)
+
+
 def set_vivisect_log_level(level):
     logging.getLogger("vivisect").setLevel(level)
     logging.getLogger("vivisect.base").setLevel(level)
@@ -158,27 +169,28 @@ def find_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, disable_pro
     n_funcs = len(functions)
 
     pb = pbar(functions, desc="matching", unit=" functions", postfix="skipped 0 library functions")
-    for f in pb:
-        function_address = int(f)
-
-        if extractor.is_library_function(function_address):
-            function_name = extractor.get_function_name(function_address)
-            logger.debug("skipping library function 0x%x (%s)", function_address, function_name)
-            meta["library_functions"][function_address] = function_name
-            n_libs = len(meta["library_functions"])
-            percentage = 100 * (n_libs / n_funcs)
-            if isinstance(pb, tqdm.tqdm):
-                pb.set_postfix_str("skipped %d library functions (%d%%)" % (n_libs, percentage))
-            continue
+    with timing("match functions"):
+        for f in pb:
+            function_address = int(f)
+
+            if extractor.is_library_function(function_address):
+                function_name = extractor.get_function_name(function_address)
+                logger.debug("skipping library function 0x%x (%s)", function_address, function_name)
+                meta["library_functions"][function_address] = function_name
+                n_libs = len(meta["library_functions"])
+                percentage = 100 * (n_libs / n_funcs)
+                if isinstance(pb, tqdm.tqdm):
+                    pb.set_postfix_str("skipped %d library functions (%d%%)" % (n_libs, percentage))
+                continue
 
-        function_matches, bb_matches, feature_count = find_function_capabilities(ruleset, extractor, f)
-        meta["feature_counts"]["functions"][function_address] = feature_count
-        logger.debug("analyzed function 0x%x and extracted %d features", function_address, feature_count)
+            function_matches, bb_matches, feature_count = find_function_capabilities(ruleset, extractor, f)
+            meta["feature_counts"]["functions"][function_address] = feature_count
+            logger.debug("analyzed function 0x%x and extracted %d features", function_address, feature_count)
 
-        for rule_name, res in function_matches.items():
-            all_function_matches[rule_name].extend(res)
-        for rule_name, res in bb_matches.items():
-            all_bb_matches[rule_name].extend(res)
+            for rule_name, res in function_matches.items():
+                all_function_matches[rule_name].extend(res)
+            for rule_name, res in bb_matches.items():
+                all_bb_matches[rule_name].extend(res)
 
     # collection of features that captures the rule matches within function and BB scopes.
     # mapping from feature (matched rule) to set of addresses at which it matched.
@@ -188,7 +200,8 @@ def find_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, disable_pro
         rule = ruleset[rule_name]
         capa.engine.index_rule_matches(function_and_lower_features, rule, locations)
 
-    all_file_matches, feature_count = find_file_capabilities(ruleset, extractor, function_and_lower_features)
+    with timing("match file"):
+        all_file_matches, feature_count = find_file_capabilities(ruleset, extractor, function_and_lower_features)
     meta["feature_counts"]["file"] = feature_count
 
     matches = {
@@ -400,9 +413,11 @@ def get_workspace(path, format, sigpaths):
     else:
         raise ValueError("unexpected format: " + format)
 
-    viv_utils.flirt.register_flirt_signature_analyzers(vw, sigpaths)
+    with timing("load FLIRT"):
+        viv_utils.flirt.register_flirt_signature_analyzers(vw, sigpaths)
 
-    vw.analyze()
+    with timing("viv analyze"):
+        vw.analyze()
 
     logger.debug("%s", get_meta_str(vw))
     return vw
@@ -890,8 +905,10 @@ def main(argv=None):
         return E_MISSING_FILE
 
     try:
-        rules = get_rules(args.rules, disable_progress=args.quiet)
-        rules = capa.rules.RuleSet(rules)
+        with timing("load rules"):
+            rules = get_rules(args.rules, disable_progress=args.quiet)
+            rules = capa.rules.RuleSet(rules)
+
         logger.debug(
             "successfully loaded %s rules",
             # during the load of the RuleSet, we extract subscope statements into their own rules
@@ -1003,10 +1020,14 @@ def main(argv=None):
 
     meta = collect_metadata(argv, args.sample, args.rules, extractor)
 
-    capabilities, counts = find_capabilities(rules, extractor, disable_progress=args.quiet)
+    with timing("find capabilities"):
+        capabilities, counts = find_capabilities(rules, extractor, disable_progress=args.quiet)
     meta["analysis"].update(counts)
     meta["analysis"]["layout"] = compute_layout(rules, extractor, capabilities)
 
+    for (counter, count) in capa.perf.counters.most_common():
+        logger.debug("perf: counter: {:}: {:,}".format(counter, count))
+
     if has_file_limitation(rules, capabilities):
         # bail if capa encountered file limitation e.g. a packed binary
         # do show the output in verbose mode, though.

diff --git a/capa/perf.py b/capa/perf.py
@@ -0,0 +1,9 @@
+import collections
+from typing import Dict
+
+counters: Dict[str, int] = collections.Counter()
+
+
+def reset():
+    global counters
+    counters = collections.Counter()
diff --git a/capa/rules.py b/capa/rules.py
@@ -27,6 +27,7 @@
 import yaml
 import ruamel.yaml
 
+import capa.perf
 import capa.engine as ceng
 import capa.features
 import capa.features.file
@@ -620,6 +621,8 @@ def extract_subscope_rules(self):
             yield new_rule
 
     def evaluate(self, features: FeatureSet):
+        capa.perf.counters["evaluate.feature"] += 1
+        capa.perf.counters["evaluate.feature.rule"] += 1
         return self.statement.evaluate(features)
 
     @classmethod

diff --git a/scripts/profile-time.py b/scripts/profile-time.py
@@ -0,0 +1,115 @@
+import sys
+import timeit
+import logging
+import argparse
+import subprocess
+
+import tqdm
+import tabulate
+
+import capa.main
+import capa.perf
+import capa.rules
+import capa.engine
+import capa.helpers
+import capa.features
+import capa.features.common
+import capa.features.freeze
+
+logger = logging.getLogger("capa.profile")
+
+
+def main(argv=None):
+    if argv is None:
+        argv = sys.argv[1:]
+
+    label = subprocess.run(
+        "git show --pretty=oneline --abbrev-commit | head -n 1", shell=True, capture_output=True, text=True
+    ).stdout.strip()
+    is_dirty = (
+        subprocess.run(
+            "git status | grep 'modified: ' | grep -v 'rules' | grep -v 'tests/data'",
+            shell=True,
+            capture_output=True,
+            text=True,
+        ).stdout
+        != ""
+    )
+
+    if is_dirty:
+        label += " (dirty)"
+
+    parser = argparse.ArgumentParser(description="Profile capa performance")
+    capa.main.install_common_args(parser, wanted={"format", "sample", "signatures", "rules"})
+
+    parser.add_argument("--number", type=int, default=3, help="batch size of profile collection")
+    parser.add_argument("--repeat", type=int, default=30, help="batch count of profile collection")
+    parser.add_argument("--label", type=str, default=label, help="description of the profile collection")
+
+    args = parser.parse_args(args=argv)
+    capa.main.handle_common_args(args)
+
+    try:
+        taste = capa.helpers.get_file_taste(args.sample)
+    except IOError as e:
+        logger.error("%s", str(e))
+        return -1
+
+    try:
+        with capa.main.timing("load rules"):
+            rules = capa.rules.RuleSet(capa.main.get_rules(args.rules, disable_progress=True))
+    except (IOError) as e:
+        logger.error("%s", str(e))
+        return -1
+
+    try:
+        sig_paths = capa.main.get_signatures(args.signatures)
+    except (IOError) as e:
+        logger.error("%s", str(e))
+        return -1
+
+    if (args.format == "freeze") or (args.format == "auto" and capa.features.freeze.is_freeze(taste)):
+        with open(args.sample, "rb") as f:
+            extractor = capa.features.freeze.load(f.read())
+    else:
+        extractor = capa.main.get_extractor(
+            args.sample, args.format, capa.main.BACKEND_VIV, sig_paths, should_save_workspace=False
+        )
+
+    with tqdm.tqdm(total=args.number * args.repeat) as pbar:
+
+        def do_iteration():
+            capa.perf.reset()
+            capa.main.find_capabilities(rules, extractor, disable_progress=True)
+            pbar.update(1)
+
+        samples = timeit.repeat(do_iteration, number=args.number, repeat=args.repeat)
+
+    logger.debug("perf: find capabilities: min: %0.2fs" % (min(samples) / float(args.number)))
+    logger.debug("perf: find capabilities: avg: %0.2fs" % (sum(samples) / float(args.repeat) / float(args.number)))
+    logger.debug("perf: find capabilities: max: %0.2fs" % (max(samples) / float(args.number)))
+
+    for (counter, count) in capa.perf.counters.most_common():
+        logger.debug("perf: counter: {:}: {:,}".format(counter, count))
+
+    print(
+        tabulate.tabulate(
+            [
+                (
+                    args.label,
+                    "{:,}".format(capa.perf.counters["evaluate.feature"]),
+                    "%0.2fs" % (sum(samples) / float(args.repeat) / float(args.number)),
+                    "%0.2fs" % (min(samples) / float(args.number)),
+                    "%0.2fs" % (max(samples) / float(args.number)),
+                )
+            ],
+            headers=["label", "count(evaluations)", "avg(time)", "min(time)", "max(time)"],
+            tablefmt="github",
+        )
+    )
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())