mandiant · linpeiyu164 · Mar 31, 2023 · Mar 31, 2023 · Apr 1, 2023 · Apr 1, 2023
diff --git a/capa/features/extractors/common.py b/capa/features/extractors/common.py
@@ -36,14 +36,19 @@
 MATCH_RESULT = b'{"meta":'
 
 
-def extract_file_strings(buf, **kwargs) -> Iterator[Tuple[String, Address]]:
+def extract_file_strings(
+    buf, min_len: int = capa.features.extractors.strings.DEFAULT_STRING_LENGTH, **kwargs
+) -> Iterator[Tuple[String, Address]]:
     """
     extract ASCII and UTF-16 LE strings from file
+
+    min_len is forwarded to both string scanners as their `min_len` argument;
+    it defaults to DEFAULT_STRING_LENGTH so callers that pass only `buf` keep working.
     """
-    for s in capa.features.extractors.strings.extract_ascii_strings(buf):
+    for s in capa.features.extractors.strings.extract_ascii_strings(buf, min_len=min_len):
         yield String(s.s), FileOffsetAddress(s.offset)
 
-    for s in capa.features.extractors.strings.extract_unicode_strings(buf):
+    for s in capa.features.extractors.strings.extract_unicode_strings(buf, min_len=min_len):
         yield String(s.s), FileOffsetAddress(s.offset)
 
 
@@ -68,7 +73,7 @@ def extract_format(buf) -> Iterator[Tuple[Feature, Address]]:
 def extract_arch(buf) -> Iterator[Tuple[Feature, Address]]:
     if buf.startswith(MATCH_PE):
         yield from capa.features.extractors.pefile.extract_file_arch(pe=pefile.PE(data=buf))
-    
+
     elif buf.startswith(MATCH_RESULT):
         yield Arch(ARCH_ANY), NO_ADDRESS
 

diff --git a/capa/features/extractors/dnfile/extractor.py b/capa/features/extractors/dnfile/extractor.py
@@ -20,6 +20,7 @@
 import capa.features.extractors.dnfile.function
 from capa.features.common import Feature
 from capa.features.address import NO_ADDRESS, Address, DNTokenAddress, DNTokenOffsetAddress
+from capa.features.extractors.strings import DEFAULT_STRING_LENGTH
 from capa.features.extractors.dnfile.types import DnType, DnUnmanagedMethod
 from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, FeatureExtractor
 from capa.features.extractors.dnfile.helpers import (
@@ -68,9 +69,10 @@ def get_type(self, token: int) -> Optional[Union[DnType, DnUnmanagedMethod]]:
 
 
 class DnfileFeatureExtractor(FeatureExtractor):
-    def __init__(self, path: str):
+    def __init__(self, path: str, min_str_len: int = DEFAULT_STRING_LENGTH):
         super().__init__()
         self.pe: dnfile.dnPE = dnfile.dnPE(path)
+        self.min_str_len = min_str_len
 
         # pre-compute .NET token lookup tables; each .NET method has access to this cache for feature extraction
         # most relevant at instruction scope
@@ -89,7 +91,9 @@ def extract_global_features(self):
         yield from self.global_features
 
     def extract_file_features(self):
-        yield from capa.features.extractors.dnfile.file.extract_features(self.pe)
+        yield from capa.features.extractors.dnfile.file.extract_features(
+            file_ctx={"pe": self.pe, "min_len": self.min_str_len}
+        )
 
     def get_functions(self) -> Iterator[FunctionHandle]:
         # create a method lookup table
@@ -98,7 +102,13 @@ def get_functions(self) -> Iterator[FunctionHandle]:
             fh: FunctionHandle = FunctionHandle(
                 address=DNTokenAddress(token),
                 inner=method,
-                ctx={"pe": self.pe, "calls_from": set(), "calls_to": set(), "cache": self.token_cache},
+                ctx={
+                    "pe": self.pe,
+                    "calls_from": set(),
+                    "calls_to": set(),
+                    "cache": self.token_cache,
+                    "min_str_len": self.min_str_len,
+                },
             )
 
             # method tokens should be unique

diff --git a/capa/features/extractors/dnfile/file.py b/capa/features/extractors/dnfile/file.py
@@ -18,37 +18,37 @@
 from capa.features.address import Address
 
 
-def extract_file_import_names(pe: dnfile.dnPE) -> Iterator[Tuple[Import, Address]]:
-    yield from capa.features.extractors.dotnetfile.extract_file_import_names(pe=pe)
+def extract_file_import_names(file_ctx) -> Iterator[Tuple[Import, Address]]:
+    yield from capa.features.extractors.dotnetfile.extract_file_import_names(file_ctx)
 
 
-def extract_file_format(pe: dnfile.dnPE) -> Iterator[Tuple[Format, Address]]:
-    yield from capa.features.extractors.dotnetfile.extract_file_format(pe=pe)
+def extract_file_format(**kwargs) -> Iterator[Tuple[Format, Address]]:
+    yield from capa.features.extractors.dotnetfile.extract_file_format()
 
 
-def extract_file_function_names(pe: dnfile.dnPE) -> Iterator[Tuple[FunctionName, Address]]:
-    yield from capa.features.extractors.dotnetfile.extract_file_function_names(pe=pe)
+def extract_file_function_names(file_ctx) -> Iterator[Tuple[FunctionName, Address]]:
+    yield from capa.features.extractors.dotnetfile.extract_file_function_names(file_ctx)
 
 
-def extract_file_strings(pe: dnfile.dnPE) -> Iterator[Tuple[String, Address]]:
-    yield from capa.features.extractors.dotnetfile.extract_file_strings(pe=pe)
+def extract_file_strings(file_ctx) -> Iterator[Tuple[String, Address]]:
+    yield from capa.features.extractors.dotnetfile.extract_file_strings(file_ctx)
 
 
-def extract_file_mixed_mode_characteristic_features(pe: dnfile.dnPE) -> Iterator[Tuple[Characteristic, Address]]:
-    yield from capa.features.extractors.dotnetfile.extract_file_mixed_mode_characteristic_features(pe=pe)
+def extract_file_mixed_mode_characteristic_features(file_ctx) -> Iterator[Tuple[Characteristic, Address]]:
+    yield from capa.features.extractors.dotnetfile.extract_file_mixed_mode_characteristic_features(file_ctx)
 
 
-def extract_file_namespace_features(pe: dnfile.dnPE) -> Iterator[Tuple[Namespace, Address]]:
-    yield from capa.features.extractors.dotnetfile.extract_file_namespace_features(pe=pe)
+def extract_file_namespace_features(file_ctx) -> Iterator[Tuple[Namespace, Address]]:
+    yield from capa.features.extractors.dotnetfile.extract_file_namespace_features(file_ctx)
 
 
-def extract_file_class_features(pe: dnfile.dnPE) -> Iterator[Tuple[Class, Address]]:
-    yield from capa.features.extractors.dotnetfile.extract_file_class_features(pe=pe)
+def extract_file_class_features(file_ctx) -> Iterator[Tuple[Class, Address]]:
+    yield from capa.features.extractors.dotnetfile.extract_file_class_features(file_ctx)
 
 
-def extract_features(pe: dnfile.dnPE) -> Iterator[Tuple[Feature, Address]]:
+def extract_features(file_ctx) -> Iterator[Tuple[Feature, Address]]:
     for file_handler in FILE_HANDLERS:
-        for feature, address in file_handler(pe):
+        for feature, address in file_handler(file_ctx=file_ctx):
             yield feature, address
 
 

diff --git a/capa/features/extractors/dnfile/insn.py b/capa/features/extractors/dnfile/insn.py
@@ -191,7 +191,7 @@ def extract_insn_string_features(fh: FunctionHandle, bh, ih: InsnHandle) -> Iter
     if user_string is None:
         return
 
-    if len(user_string) >= 4:
+    if len(user_string) >= fh.ctx["min_str_len"]:
         yield String(user_string), ih.address
 
 

diff --git a/capa/features/extractors/dotnetfile.py b/capa/features/extractors/dotnetfile.py
@@ -23,6 +23,7 @@
     Characteristic,
 )
 from capa.features.address import NO_ADDRESS, Address, DNTokenAddress
+from capa.features.extractors.strings import DEFAULT_STRING_LENGTH
 from capa.features.extractors.base_extractor import FeatureExtractor
 from capa.features.extractors.dnfile.helpers import (
     DnType,
@@ -42,7 +43,8 @@ def extract_file_format(**kwargs) -> Iterator[Tuple[Format, Address]]:
     yield Format(FORMAT_DOTNET), NO_ADDRESS
 
 
-def extract_file_import_names(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple[Import, Address]]:
+def extract_file_import_names(file_ctx) -> Iterator[Tuple[Import, Address]]:
+    pe = file_ctx["pe"]
     for method in get_dotnet_managed_imports(pe):
         # like System.IO.File::OpenRead
         yield Import(str(method)), DNTokenAddress(method.token)
@@ -53,17 +55,17 @@ def extract_file_import_names(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple[Impor
             yield Import(name), DNTokenAddress(imp.token)
 
 
-def extract_file_function_names(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple[FunctionName, Address]]:
-    for method in get_dotnet_managed_methods(pe):
+def extract_file_function_names(file_ctx) -> Iterator[Tuple[FunctionName, Address]]:
+    for method in get_dotnet_managed_methods(file_ctx["pe"]):
         yield FunctionName(str(method)), DNTokenAddress(method.token)
 
 
-def extract_file_namespace_features(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple[Namespace, Address]]:
+def extract_file_namespace_features(file_ctx) -> Iterator[Tuple[Namespace, Address]]:
     """emit namespace features from TypeRef and TypeDef tables"""
 
     # namespaces may be referenced multiple times, so we need to filter
     namespaces = set()
-
+    pe = file_ctx["pe"]
     for _, typedef in iter_dotnet_table(pe, dnfile.mdtable.TypeDef.number):
         # emit internal .NET namespaces
         assert isinstance(typedef, dnfile.mdtable.TypeDefRow)
@@ -82,8 +84,9 @@ def extract_file_namespace_features(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple
         yield Namespace(namespace), NO_ADDRESS
 
 
-def extract_file_class_features(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple[Class, Address]]:
+def extract_file_class_features(file_ctx) -> Iterator[Tuple[Class, Address]]:
     """emit class features from TypeRef and TypeDef tables"""
+    pe = file_ctx["pe"]
     for rid, typedef in iter_dotnet_table(pe, dnfile.mdtable.TypeDef.number):
         # emit internal .NET classes
         assert isinstance(typedef, dnfile.mdtable.TypeDefRow)
@@ -117,20 +120,20 @@ def extract_file_arch(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple[Arch, Address
         yield Arch(ARCH_ANY), NO_ADDRESS
 
 
-def extract_file_strings(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple[String, Address]]:
-    yield from capa.features.extractors.common.extract_file_strings(pe.__data__)
+def extract_file_strings(file_ctx) -> Iterator[Tuple[String, Address]]:
+    yield from capa.features.extractors.common.extract_file_strings(
+        file_ctx["pe"].__data__, min_len=file_ctx["min_len"]
+    )
 
 
-def extract_file_mixed_mode_characteristic_features(
-    pe: dnfile.dnPE, **kwargs
-) -> Iterator[Tuple[Characteristic, Address]]:
-    if is_dotnet_mixed_mode(pe):
+def extract_file_mixed_mode_characteristic_features(file_ctx) -> Iterator[Tuple[Characteristic, Address]]:
+    if is_dotnet_mixed_mode(file_ctx["pe"]):
         yield Characteristic("mixed mode"), NO_ADDRESS
 
 
-def extract_file_features(pe: dnfile.dnPE) -> Iterator[Tuple[Feature, Address]]:
+def extract_file_features(file_ctx) -> Iterator[Tuple[Feature, Address]]:
     for file_handler in FILE_HANDLERS:
-        for feature, addr in file_handler(pe=pe):  # type: ignore
+        for feature, addr in file_handler(file_ctx=file_ctx):  # type: ignore
             yield feature, addr
 
 
@@ -158,10 +161,11 @@ def extract_global_features(pe: dnfile.dnPE) -> Iterator[Tuple[Feature, Address]
 
 
 class DotnetFileFeatureExtractor(FeatureExtractor):
-    def __init__(self, path: str):
+    def __init__(self, path: str, min_len: int = DEFAULT_STRING_LENGTH):
         super().__init__()
         self.path: str = path
         self.pe: dnfile.dnPE = dnfile.dnPE(path)
+        self.min_len: int = min_len
 
     def get_base_address(self):
         return NO_ADDRESS
@@ -179,7 +183,7 @@ def extract_global_features(self):
         yield from extract_global_features(self.pe)
 
     def extract_file_features(self):
-        yield from extract_file_features(self.pe)
+        yield from extract_file_features(file_ctx={"pe": self.pe, "min_len": self.min_len})
 
     def is_dotnet_file(self) -> bool:
         return bool(self.pe.net)

diff --git a/capa/features/extractors/elffile.py b/capa/features/extractors/elffile.py
@@ -15,14 +15,17 @@
 from capa.features.file import Import, Section
 from capa.features.common import OS, FORMAT_ELF, Arch, Format, Feature
 from capa.features.address import NO_ADDRESS, FileOffsetAddress, AbsoluteVirtualAddress
+from capa.features.extractors.strings import DEFAULT_STRING_LENGTH
 from capa.features.extractors.base_extractor import FeatureExtractor
 
 logger = logging.getLogger(__name__)
 
 
-def extract_file_import_names(elf, **kwargs):
+def extract_file_import_names(file_ctx):
     # see https://github.com/eliben/pyelftools/blob/0664de05ed2db3d39041e2d51d19622a8ef4fb0f/scripts/readelf.py#L372
-    symbol_tables = [(idx, s) for idx, s in enumerate(elf.iter_sections()) if isinstance(s, SymbolTableSection)]
+    symbol_tables = [
+        (idx, s) for idx, s in enumerate(file_ctx["elf"].iter_sections()) if isinstance(s, SymbolTableSection)
+    ]
 
     for _, section in symbol_tables:
         if not isinstance(section, SymbolTableSection):
@@ -41,16 +44,16 @@ def extract_file_import_names(elf, **kwargs):
                 yield Import(symbol.name), FileOffsetAddress(0x0)
 
 
-def extract_file_section_names(elf, **kwargs):
-    for section in elf.iter_sections():
+def extract_file_section_names(file_ctx):
+    for section in file_ctx["elf"].iter_sections():
         if section.name:
             yield Section(section.name), AbsoluteVirtualAddress(section.header.sh_addr)
         elif section.is_null():
             yield Section("NULL"), AbsoluteVirtualAddress(section.header.sh_addr)
 
 
-def extract_file_strings(buf, **kwargs):
-    yield from capa.features.extractors.common.extract_file_strings(buf)
+def extract_file_strings(file_ctx):
+    yield from capa.features.extractors.common.extract_file_strings(file_ctx["buf"], file_ctx["min_len"])
 
 
 def extract_file_os(elf, buf, **kwargs):
@@ -78,9 +81,9 @@ def extract_file_arch(elf, **kwargs):
         logger.warning("unsupported architecture: %s", arch)
 
 
-def extract_file_features(elf: ELFFile, buf: bytes) -> Iterator[Tuple[Feature, int]]:
+def extract_file_features(file_ctx) -> Iterator[Tuple[Feature, int]]:
     for file_handler in FILE_HANDLERS:
-        for feature, addr in file_handler(elf=elf, buf=buf):  # type: ignore
+        for feature, addr in file_handler(file_ctx=file_ctx):  # type: ignore
             yield feature, addr
 
 
@@ -107,9 +110,10 @@ def extract_global_features(elf: ELFFile, buf: bytes) -> Iterator[Tuple[Feature,
 
 
 class ElfFeatureExtractor(FeatureExtractor):
-    def __init__(self, path: str):
+    def __init__(self, path: str, min_len: int = DEFAULT_STRING_LENGTH):
         super().__init__()
         self.path = path
+        self.min_len = min_len
         with open(self.path, "rb") as f:
             self.elf = ELFFile(io.BytesIO(f.read()))
 
@@ -130,7 +134,7 @@ def extract_file_features(self):
         with open(self.path, "rb") as f:
             buf = f.read()
 
-        for feature, addr in extract_file_features(self.elf, buf):
+        for feature, addr in extract_file_features(file_ctx={"elf": self.elf, "buf": buf, "min_len": self.min_len}):
             yield feature, addr
 
     def get_functions(self):

diff --git a/capa/features/extractors/ida/extractor.py b/capa/features/extractors/ida/extractor.py
@@ -18,16 +18,18 @@
 import capa.features.extractors.ida.basicblock
 from capa.features.common import Feature
 from capa.features.address import Address, AbsoluteVirtualAddress
+from capa.features.extractors.strings import DEFAULT_STRING_LENGTH
 from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, FeatureExtractor
 
 
 class IdaFeatureExtractor(FeatureExtractor):
-    def __init__(self):
+    def __init__(self, min_len: int = DEFAULT_STRING_LENGTH):
         super().__init__()
         self.global_features: List[Tuple[Feature, Address]] = []
         self.global_features.extend(capa.features.extractors.ida.file.extract_file_format())
         self.global_features.extend(capa.features.extractors.ida.global_.extract_os())
         self.global_features.extend(capa.features.extractors.ida.global_.extract_arch())
+        self.min_len = min_len
 
     def get_base_address(self):
         return AbsoluteVirtualAddress(idaapi.get_imagebase())
@@ -36,18 +38,17 @@ def extract_global_features(self):
         yield from self.global_features
 
     def extract_file_features(self):
-        yield from capa.features.extractors.ida.file.extract_features()
+        yield from capa.features.extractors.ida.file.extract_features(file_ctx={"min_len": self.min_len})
 
     def get_functions(self) -> Iterator[FunctionHandle]:
         import capa.features.extractors.ida.helpers as ida_helpers
 
         # ignore library functions and thunk functions as identified by IDA
         yield from ida_helpers.get_functions(skip_thunks=True, skip_libs=True)
 
-    @staticmethod
-    def get_function(ea: int) -> FunctionHandle:
+    def get_function(self, ea: int) -> FunctionHandle:
         f = idaapi.get_func(ea)
-        return FunctionHandle(address=AbsoluteVirtualAddress(f.start_ea), inner=f)
+        return FunctionHandle(address=AbsoluteVirtualAddress(f.start_ea), inner=f, ctx={"min_len": self.min_len})
 
     def extract_function_features(self, fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]:
         yield from capa.features.extractors.ida.function.extract_features(fh)