diff --git a/CHANGELOG.md b/CHANGELOG.md index d88b1f4d1..6f83d8a57 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -90,6 +90,7 @@ - show-features: better render strings with embedded whitespace #1267 @williballenthin - handle vivisect bug around strings at instruction level, use min length 4 #1271 @williballenthin @mr-tz - extractor: guard against invalid "calls from" features #1177 @mr-tz +- extractor: add format to global features #1258 @mr-tz ### capa explorer IDA Pro plugin - fix: display instruction items #1154 @mr-tz diff --git a/capa/features/extractors/dnfile/extractor.py b/capa/features/extractors/dnfile/extractor.py index bd4b9c9e9..9d7c6c4c5 100644 --- a/capa/features/extractors/dnfile/extractor.py +++ b/capa/features/extractors/dnfile/extractor.py @@ -8,13 +8,13 @@ from __future__ import annotations -from enum import Enum from typing import Dict, List, Tuple, Union, Iterator, Optional import dnfile from dncil.cil.opcode import OpCodes import capa.features.extractors +import capa.features.extractors.dotnetfile import capa.features.extractors.dnfile.file import capa.features.extractors.dnfile.insn import capa.features.extractors.dnfile.function @@ -78,6 +78,7 @@ def __init__(self, path: str): # pre-compute these because we'll yield them at *every* scope. self.global_features: List[Tuple[Feature, Address]] = [] + self.global_features.extend(capa.features.extractors.dotnetfile.extract_file_format()) self.global_features.extend(capa.features.extractors.dotnetfile.extract_file_os(pe=self.pe)) self.global_features.extend(capa.features.extractors.dotnetfile.extract_file_arch(pe=self.pe)) diff --git a/capa/features/extractors/ida/extractor.py b/capa/features/extractors/ida/extractor.py index 1a587fa69..0d44ba9e1 100644 --- a/capa/features/extractors/ida/extractor.py +++ b/capa/features/extractors/ida/extractor.py @@ -25,6 +25,7 @@ class IdaFeatureExtractor(FeatureExtractor): def __init__(self): super().__init__() self.global_features: List[Tuple[Feature, Address]] = [] + self.global_features.extend(capa.features.extractors.ida.file.extract_file_format()) self.global_features.extend(capa.features.extractors.ida.global_.extract_os()) self.global_features.extend(capa.features.extractors.ida.global_.extract_arch()) diff --git a/capa/features/extractors/viv/extractor.py b/capa/features/extractors/viv/extractor.py index 53683f667..e2d0fb1e1 100644 --- a/capa/features/extractors/viv/extractor.py +++ b/capa/features/extractors/viv/extractor.py @@ -34,6 +34,7 @@ def __init__(self, vw, path): # pre-compute these because we'll yield them at *every* scope. self.global_features: List[Tuple[Feature, Address]] = [] + self.global_features.extend(capa.features.extractors.viv.file.extract_file_format(self.buf)) self.global_features.extend(capa.features.extractors.common.extract_os(self.buf)) self.global_features.extend(capa.features.extractors.viv.global_.extract_arch(self.vw)) diff --git a/capa/helpers.py b/capa/helpers.py index 2e44fc6c3..a2edc812b 100644 --- a/capa/helpers.py +++ b/capa/helpers.py @@ -10,7 +10,7 @@ from typing import NoReturn from capa.exceptions import UnsupportedFormatError -from capa.features.common import FORMAT_SC32, FORMAT_SC64, FORMAT_UNKNOWN +from capa.features.common import FORMAT_PE, FORMAT_SC32, FORMAT_SC64, FORMAT_DOTNET, FORMAT_UNKNOWN, Format EXTENSIONS_SHELLCODE_32 = ("sc32", "raw32") EXTENSIONS_SHELLCODE_64 = ("sc64", "raw64") @@ -68,11 +68,17 @@ def get_auto_format(path: str) -> str: def get_format(sample: str) -> str: # imported locally to avoid import cycle from capa.features.extractors.common import extract_format + from capa.features.extractors.dnfile_ import DnfileFeatureExtractor with open(sample, "rb") as f: buf = f.read() for feature, _ in extract_format(buf): + if feature == Format(FORMAT_PE): + dnfile_extractor = DnfileFeatureExtractor(sample) + if dnfile_extractor.is_dotnet_file(): + feature = Format(FORMAT_DOTNET) + assert isinstance(feature.value, str) return feature.value diff --git a/capa/main.py b/capa/main.py index 6262b7d0c..1741fad9e 100644 --- a/capa/main.py +++ b/capa/main.py @@ -20,7 +20,7 @@ import itertools import contextlib import collections -from typing import Any, Dict, List, Tuple, Optional +from typing import Any, Dict, List, Tuple import halo import tqdm @@ -535,12 +535,12 @@ def get_extractor( def get_file_extractors(sample: str, format_: str) -> List[FeatureExtractor]: file_extractors: List[FeatureExtractor] = list() - if format_ == capa.features.extractors.common.FORMAT_PE: + if format_ == FORMAT_PE: file_extractors.append(capa.features.extractors.pefile.PefileFeatureExtractor(sample)) - dnfile_extractor = capa.features.extractors.dnfile_.DnfileFeatureExtractor(sample) - if dnfile_extractor.is_dotnet_file(): - file_extractors.append(dnfile_extractor) + elif format_ == FORMAT_DOTNET: + file_extractors.append(capa.features.extractors.pefile.PefileFeatureExtractor(sample)) + file_extractors.append(capa.features.extractors.dnfile_.DnfileFeatureExtractor(sample)) elif format_ == capa.features.extractors.common.FORMAT_ELF: file_extractors.append(capa.features.extractors.elffile.ElfFeatureExtractor(sample)) @@ -646,7 +646,6 @@ def collect_metadata( sample_path: str, rules_path: List[str], extractor: capa.features.extractors.base_extractor.FeatureExtractor, - format_: Optional[str] = None, ): md5 = hashlib.md5() sha1 = hashlib.sha1() @@ -662,8 +661,7 @@ def collect_metadata( if rules_path != [RULES_PATH_DEFAULT_STRING]: rules_path = [os.path.abspath(os.path.normpath(r)) for r in rules_path] - if format_ is None: - format_ = get_format(sample_path) + format_ = get_format(sample_path) arch = get_arch(sample_path) os_ = get_os(sample_path) @@ -996,6 +994,9 @@ def main(argv=None): if format_ == FORMAT_AUTO: try: format_ = get_auto_format(args.sample) + except PEFormatError as e: + logger.error("Input file '%s' is not a valid PE file: %s", args.sample, str(e)) + return E_CORRUPT_FILE except UnsupportedFormatError: log_unsupported_format_error() return E_INVALID_FILE_TYPE @@ -1058,9 +1059,6 @@ def main(argv=None): logger.error("Input file '%s' is not a valid ELF file: %s", args.sample, str(e)) return E_CORRUPT_FILE - if isinstance(file_extractor, capa.features.extractors.dnfile_.DnfileFeatureExtractor): - format_ = FORMAT_DOTNET - # file limitations that rely on non-file scope won't be detected here. # nor on FunctionName features, because pefile doesn't support this. if has_file_limitation(rules, pure_file_capabilities): @@ -1100,7 +1098,7 @@ def main(argv=None): log_unsupported_os_error() return E_INVALID_FILE_OS - meta = collect_metadata(argv, args.sample, args.rules, extractor, format_=format_) + meta = collect_metadata(argv, args.sample, args.rules, extractor) capabilities, counts = find_capabilities(rules, extractor, disable_progress=args.quiet) meta["analysis"].update(counts) diff --git a/scripts/lint.py b/scripts/lint.py index ed6e8b177..767cd0a8f 100644 --- a/scripts/lint.py +++ b/scripts/lint.py @@ -307,11 +307,7 @@ def get_sample_capabilities(ctx: Context, path: Path) -> Set[str]: elif nice_path.endswith(capa.helpers.EXTENSIONS_SHELLCODE_64): format_ = "sc64" else: - format_ = "auto" - if not nice_path.endswith(capa.helpers.EXTENSIONS_ELF): - dnfile_extractor = capa.features.extractors.dnfile_.DnfileFeatureExtractor(nice_path) - if dnfile_extractor.is_dotnet_file(): - format_ = FORMAT_DOTNET + format_ = capa.main.get_auto_format(nice_path) logger.debug("analyzing sample: %s", nice_path) extractor = capa.main.get_extractor(nice_path, format_, "", DEFAULT_SIGNATURES, False, disable_progress=True) diff --git a/scripts/show-capabilities-by-function.py b/scripts/show-capabilities-by-function.py index 7f0b13b6d..f4303fef9 100644 --- a/scripts/show-capabilities-by-function.py +++ b/scripts/show-capabilities-by-function.py @@ -175,7 +175,7 @@ def main(argv=None): capa.helpers.log_unsupported_runtime_error() return -1 - meta = capa.main.collect_metadata(argv, args.sample, args.rules, extractor, format_=format_) + meta = capa.main.collect_metadata(argv, args.sample, args.rules, extractor) capabilities, counts = capa.main.find_capabilities(rules, extractor) meta["analysis"].update(counts) meta["analysis"]["layout"] = capa.main.compute_layout(rules, extractor, capabilities) diff --git a/tests/fixtures.py b/tests/fixtures.py index 734f64a47..ae336f562 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -689,14 +689,22 @@ def parametrize(params, values, **kwargs): # os & format & arch ("pma16-01", "file", OS(OS_WINDOWS), True), ("pma16-01", "file", OS(OS_LINUX), False), + ("mimikatz", "file", OS(OS_WINDOWS), True), ("pma16-01", "function=0x404356", OS(OS_WINDOWS), True), ("pma16-01", "function=0x404356,bb=0x4043B9", OS(OS_WINDOWS), True), + ("mimikatz", "function=0x40105D", OS(OS_WINDOWS), True), ("pma16-01", "file", Arch(ARCH_I386), True), ("pma16-01", "file", Arch(ARCH_AMD64), False), + ("mimikatz", "file", Arch(ARCH_I386), True), ("pma16-01", "function=0x404356", Arch(ARCH_I386), True), ("pma16-01", "function=0x404356,bb=0x4043B9", Arch(ARCH_I386), True), + ("mimikatz", "function=0x40105D", Arch(ARCH_I386), True), ("pma16-01", "file", Format(FORMAT_PE), True), ("pma16-01", "file", Format(FORMAT_ELF), False), + ("mimikatz", "file", Format(FORMAT_PE), True), + # format is also a global feature + ("pma16-01", "function=0x404356", Format(FORMAT_PE), True), + ("mimikatz", "function=0x456BB9", Format(FORMAT_PE), True), # elf support ("7351f.elf", "file", OS(OS_LINUX), True), ("7351f.elf", "file", OS(OS_WINDOWS), False),