Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

make string length configurable and consistent across backends #1421

Closed
wants to merge 17 commits into from
Closed
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions capa/features/extractors/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,14 @@
MATCH_RESULT = b'{"meta":'


def extract_file_strings(buf, **kwargs) -> Iterator[Tuple[String, Address]]:
def extract_file_strings(buf, min_len, **kwargs) -> Iterator[Tuple[String, Address]]:
"""
extract ASCII and UTF-16 LE strings from file
"""
for s in capa.features.extractors.strings.extract_ascii_strings(buf):
for s in capa.features.extractors.strings.extract_ascii_strings(buf, min_len=min_len):
yield String(s.s), FileOffsetAddress(s.offset)

for s in capa.features.extractors.strings.extract_unicode_strings(buf):
for s in capa.features.extractors.strings.extract_unicode_strings(buf, min_len=min_len):
yield String(s.s), FileOffsetAddress(s.offset)
Comment on lines -39 to 47
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

great



Expand All @@ -68,7 +68,7 @@ def extract_format(buf) -> Iterator[Tuple[Feature, Address]]:
def extract_arch(buf) -> Iterator[Tuple[Feature, Address]]:
if buf.startswith(MATCH_PE):
yield from capa.features.extractors.pefile.extract_file_arch(pe=pefile.PE(data=buf))

elif buf.startswith(MATCH_RESULT):
yield Arch(ARCH_ANY), NO_ADDRESS

Expand Down
16 changes: 13 additions & 3 deletions capa/features/extractors/dnfile/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import capa.features.extractors.dnfile.function
from capa.features.common import Feature
from capa.features.address import NO_ADDRESS, Address, DNTokenAddress, DNTokenOffsetAddress
from capa.features.extractors.strings import DEFAULT_STRING_LENGTH
from capa.features.extractors.dnfile.types import DnType, DnUnmanagedMethod
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, FeatureExtractor
from capa.features.extractors.dnfile.helpers import (
Expand Down Expand Up @@ -68,9 +69,10 @@ def get_type(self, token: int) -> Optional[Union[DnType, DnUnmanagedMethod]]:


class DnfileFeatureExtractor(FeatureExtractor):
def __init__(self, path: str):
def __init__(self, path: str, min_len: int = DEFAULT_STRING_LENGTH):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def __init__(self, path: str, min_len: int = DEFAULT_STRING_LENGTH):
def __init__(self, path: str, min_str_len: int = DEFAULT_STRING_LENGTH):

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's not enough context here to infer what "min_len" refers to, so let's include "str" in the variable/property names.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since dnfile/file.py uses functions from dotnetfile.py, I changed extract_features in dnfile/file.py to alter the ctx variable for the dotnetfile functions (ctx = {"min_len": ctx["min_str_len"]}). I didn't change the other functions in dnfile/file.py to make them more concise. However, I'm not sure whether the inconsistency would be confusing.

super().__init__()
self.pe: dnfile.dnPE = dnfile.dnPE(path)
self.min_len = min_len
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
self.min_len = min_len
self.min_str_len = min_str_len


# pre-compute .NET token lookup tables; each .NET method has access to this cache for feature extraction
# most relevant at instruction scope
Expand All @@ -89,7 +91,9 @@ def extract_global_features(self):
yield from self.global_features

def extract_file_features(self):
yield from capa.features.extractors.dnfile.file.extract_features(self.pe)
yield from capa.features.extractors.dnfile.file.extract_features(
file_ctx={"pe": self.pe, "min_len": self.min_len}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
file_ctx={"pe": self.pe, "min_len": self.min_len}
ctx={"pe": self.pe, "min_str_len": self.min_str_len}

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given that we're invoking the "file" extractor, it's not necessary to include the term file_ in the parameter name.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While this isn't wrong, I'd just like to keep the code as concise and consistent as possible.

)

def get_functions(self) -> Iterator[FunctionHandle]:
# create a method lookup table
Expand All @@ -98,7 +102,13 @@ def get_functions(self) -> Iterator[FunctionHandle]:
fh: FunctionHandle = FunctionHandle(
address=DNTokenAddress(token),
inner=method,
ctx={"pe": self.pe, "calls_from": set(), "calls_to": set(), "cache": self.token_cache},
ctx={
"pe": self.pe,
"calls_from": set(),
"calls_to": set(),
"cache": self.token_cache,
"min_len": self.min_len,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"min_len": self.min_len,
"min_str_len": self.min_len,

},
)

# method tokens should be unique
Expand Down
32 changes: 16 additions & 16 deletions capa/features/extractors/dnfile/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,37 +18,37 @@
from capa.features.address import Address


def extract_file_import_names(pe: dnfile.dnPE) -> Iterator[Tuple[Import, Address]]:
yield from capa.features.extractors.dotnetfile.extract_file_import_names(pe=pe)
def extract_file_import_names(file_ctx) -> Iterator[Tuple[Import, Address]]:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def extract_file_import_names(file_ctx) -> Iterator[Tuple[Import, Address]]:
def extract_file_import_names(ctx) -> Iterator[Tuple[Import, Address]]:

yield from capa.features.extractors.dotnetfile.extract_file_import_names(file_ctx)


def extract_file_format(pe: dnfile.dnPE) -> Iterator[Tuple[Format, Address]]:
yield from capa.features.extractors.dotnetfile.extract_file_format(pe=pe)
def extract_file_format(**kwargs) -> Iterator[Tuple[Format, Address]]:
yield from capa.features.extractors.dotnetfile.extract_file_format()


def extract_file_function_names(pe: dnfile.dnPE) -> Iterator[Tuple[FunctionName, Address]]:
yield from capa.features.extractors.dotnetfile.extract_file_function_names(pe=pe)
def extract_file_function_names(file_ctx) -> Iterator[Tuple[FunctionName, Address]]:
yield from capa.features.extractors.dotnetfile.extract_file_function_names(file_ctx)


def extract_file_strings(pe: dnfile.dnPE) -> Iterator[Tuple[String, Address]]:
yield from capa.features.extractors.dotnetfile.extract_file_strings(pe=pe)
def extract_file_strings(file_ctx) -> Iterator[Tuple[String, Address]]:
yield from capa.features.extractors.dotnetfile.extract_file_strings(file_ctx)


def extract_file_mixed_mode_characteristic_features(pe: dnfile.dnPE) -> Iterator[Tuple[Characteristic, Address]]:
yield from capa.features.extractors.dotnetfile.extract_file_mixed_mode_characteristic_features(pe=pe)
def extract_file_mixed_mode_characteristic_features(file_ctx) -> Iterator[Tuple[Characteristic, Address]]:
yield from capa.features.extractors.dotnetfile.extract_file_mixed_mode_characteristic_features(file_ctx)


def extract_file_namespace_features(pe: dnfile.dnPE) -> Iterator[Tuple[Namespace, Address]]:
yield from capa.features.extractors.dotnetfile.extract_file_namespace_features(pe=pe)
def extract_file_namespace_features(file_ctx) -> Iterator[Tuple[Namespace, Address]]:
yield from capa.features.extractors.dotnetfile.extract_file_namespace_features(file_ctx)


def extract_file_class_features(pe: dnfile.dnPE) -> Iterator[Tuple[Class, Address]]:
yield from capa.features.extractors.dotnetfile.extract_file_class_features(pe=pe)
def extract_file_class_features(file_ctx) -> Iterator[Tuple[Class, Address]]:
yield from capa.features.extractors.dotnetfile.extract_file_class_features(file_ctx)


def extract_features(pe: dnfile.dnPE) -> Iterator[Tuple[Feature, Address]]:
def extract_features(file_ctx) -> Iterator[Tuple[Feature, Address]]:
for file_handler in FILE_HANDLERS:
for feature, address in file_handler(pe):
for feature, address in file_handler(file_ctx=file_ctx):
yield feature, address


Expand Down
2 changes: 1 addition & 1 deletion capa/features/extractors/dnfile/insn.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ def extract_insn_string_features(fh: FunctionHandle, bh, ih: InsnHandle) -> Iter
if user_string is None:
return

if len(user_string) >= 4:
if len(user_string) >= fh.ctx["min_len"]:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if len(user_string) >= fh.ctx["min_len"]:
if len(user_string) >= fh.ctx["min_str_len"]:

yield String(user_string), ih.address


Expand Down
36 changes: 20 additions & 16 deletions capa/features/extractors/dotnetfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
Characteristic,
)
from capa.features.address import NO_ADDRESS, Address, DNTokenAddress
from capa.features.extractors.strings import DEFAULT_STRING_LENGTH
from capa.features.extractors.base_extractor import FeatureExtractor
from capa.features.extractors.dnfile.helpers import (
DnType,
Expand All @@ -42,7 +43,8 @@ def extract_file_format(**kwargs) -> Iterator[Tuple[Format, Address]]:
yield Format(FORMAT_DOTNET), NO_ADDRESS


def extract_file_import_names(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple[Import, Address]]:
def extract_file_import_names(file_ctx) -> Iterator[Tuple[Import, Address]]:
pe = file_ctx["pe"]
for method in get_dotnet_managed_imports(pe):
# like System.IO.File::OpenRead
yield Import(str(method)), DNTokenAddress(method.token)
Expand All @@ -53,17 +55,17 @@ def extract_file_import_names(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple[Impor
yield Import(name), DNTokenAddress(imp.token)


def extract_file_function_names(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple[FunctionName, Address]]:
for method in get_dotnet_managed_methods(pe):
def extract_file_function_names(file_ctx) -> Iterator[Tuple[FunctionName, Address]]:
for method in get_dotnet_managed_methods(file_ctx["pe"]):
yield FunctionName(str(method)), DNTokenAddress(method.token)


def extract_file_namespace_features(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple[Namespace, Address]]:
def extract_file_namespace_features(file_ctx) -> Iterator[Tuple[Namespace, Address]]:
"""emit namespace features from TypeRef and TypeDef tables"""

# namespaces may be referenced multiple times, so we need to filter
namespaces = set()

pe = file_ctx["pe"]
for _, typedef in iter_dotnet_table(pe, dnfile.mdtable.TypeDef.number):
# emit internal .NET namespaces
assert isinstance(typedef, dnfile.mdtable.TypeDefRow)
Expand All @@ -82,8 +84,9 @@ def extract_file_namespace_features(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple
yield Namespace(namespace), NO_ADDRESS


def extract_file_class_features(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple[Class, Address]]:
def extract_file_class_features(file_ctx) -> Iterator[Tuple[Class, Address]]:
"""emit class features from TypeRef and TypeDef tables"""
pe = file_ctx["pe"]
for rid, typedef in iter_dotnet_table(pe, dnfile.mdtable.TypeDef.number):
# emit internal .NET classes
assert isinstance(typedef, dnfile.mdtable.TypeDefRow)
Expand Down Expand Up @@ -117,20 +120,20 @@ def extract_file_arch(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple[Arch, Address
yield Arch(ARCH_ANY), NO_ADDRESS


def extract_file_strings(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple[String, Address]]:
yield from capa.features.extractors.common.extract_file_strings(pe.__data__)
def extract_file_strings(file_ctx) -> Iterator[Tuple[String, Address]]:
yield from capa.features.extractors.common.extract_file_strings(
file_ctx["pe"].__data__, min_len=file_ctx["min_len"]
)


def extract_file_mixed_mode_characteristic_features(
pe: dnfile.dnPE, **kwargs
) -> Iterator[Tuple[Characteristic, Address]]:
if is_dotnet_mixed_mode(pe):
def extract_file_mixed_mode_characteristic_features(file_ctx) -> Iterator[Tuple[Characteristic, Address]]:
if is_dotnet_mixed_mode(file_ctx["pe"]):
yield Characteristic("mixed mode"), NO_ADDRESS


def extract_file_features(pe: dnfile.dnPE) -> Iterator[Tuple[Feature, Address]]:
def extract_file_features(file_ctx) -> Iterator[Tuple[Feature, Address]]:
for file_handler in FILE_HANDLERS:
for feature, addr in file_handler(pe=pe): # type: ignore
for feature, addr in file_handler(file_ctx=file_ctx): # type: ignore
yield feature, addr


Expand Down Expand Up @@ -158,10 +161,11 @@ def extract_global_features(pe: dnfile.dnPE) -> Iterator[Tuple[Feature, Address]


class DotnetFileFeatureExtractor(FeatureExtractor):
def __init__(self, path: str):
def __init__(self, path: str, min_len: int = DEFAULT_STRING_LENGTH):
super().__init__()
self.path: str = path
self.pe: dnfile.dnPE = dnfile.dnPE(path)
self.min_len: int = min_len

def get_base_address(self):
return NO_ADDRESS
Expand All @@ -179,7 +183,7 @@ def extract_global_features(self):
yield from extract_global_features(self.pe)

def extract_file_features(self):
yield from extract_file_features(self.pe)
yield from extract_file_features(file_ctx={"pe": self.pe, "min_len": self.min_len})

def is_dotnet_file(self) -> bool:
return bool(self.pe.net)
Expand Down
24 changes: 14 additions & 10 deletions capa/features/extractors/elffile.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,17 @@
from capa.features.file import Import, Section
from capa.features.common import OS, FORMAT_ELF, Arch, Format, Feature
from capa.features.address import NO_ADDRESS, FileOffsetAddress, AbsoluteVirtualAddress
from capa.features.extractors.strings import DEFAULT_STRING_LENGTH
from capa.features.extractors.base_extractor import FeatureExtractor

logger = logging.getLogger(__name__)


def extract_file_import_names(elf, **kwargs):
def extract_file_import_names(file_ctx):
# see https://github.com/eliben/pyelftools/blob/0664de05ed2db3d39041e2d51d19622a8ef4fb0f/scripts/readelf.py#L372
symbol_tables = [(idx, s) for idx, s in enumerate(elf.iter_sections()) if isinstance(s, SymbolTableSection)]
symbol_tables = [
(idx, s) for idx, s in enumerate(file_ctx["elf"].iter_sections()) if isinstance(s, SymbolTableSection)
]

for _, section in symbol_tables:
if not isinstance(section, SymbolTableSection):
Expand All @@ -41,16 +44,16 @@ def extract_file_import_names(elf, **kwargs):
yield Import(symbol.name), FileOffsetAddress(0x0)


def extract_file_section_names(elf, **kwargs):
for section in elf.iter_sections():
def extract_file_section_names(file_ctx):
for section in file_ctx["elf"].iter_sections():
if section.name:
yield Section(section.name), AbsoluteVirtualAddress(section.header.sh_addr)
elif section.is_null():
yield Section("NULL"), AbsoluteVirtualAddress(section.header.sh_addr)


def extract_file_strings(buf, **kwargs):
yield from capa.features.extractors.common.extract_file_strings(buf)
def extract_file_strings(file_ctx):
yield from capa.features.extractors.common.extract_file_strings(file_ctx["buf"], file_ctx["min_len"])


def extract_file_os(elf, buf, **kwargs):
Expand Down Expand Up @@ -78,9 +81,9 @@ def extract_file_arch(elf, **kwargs):
logger.warning("unsupported architecture: %s", arch)


def extract_file_features(elf: ELFFile, buf: bytes) -> Iterator[Tuple[Feature, int]]:
def extract_file_features(file_ctx) -> Iterator[Tuple[Feature, int]]:
for file_handler in FILE_HANDLERS:
for feature, addr in file_handler(elf=elf, buf=buf): # type: ignore
for feature, addr in file_handler(file_ctx=file_ctx): # type: ignore
yield feature, addr


Expand All @@ -107,9 +110,10 @@ def extract_global_features(elf: ELFFile, buf: bytes) -> Iterator[Tuple[Feature,


class ElfFeatureExtractor(FeatureExtractor):
def __init__(self, path: str):
def __init__(self, path: str, min_len: int = DEFAULT_STRING_LENGTH):
super().__init__()
self.path = path
self.min_len = min_len
with open(self.path, "rb") as f:
self.elf = ELFFile(io.BytesIO(f.read()))

Expand All @@ -130,7 +134,7 @@ def extract_file_features(self):
with open(self.path, "rb") as f:
buf = f.read()

for feature, addr in extract_file_features(self.elf, buf):
for feature, addr in extract_file_features(file_ctx={"elf": self.elf, "buf": buf, "min_len": self.min_len}):
yield feature, addr

def get_functions(self):
Expand Down
11 changes: 6 additions & 5 deletions capa/features/extractors/ida/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,18 @@
import capa.features.extractors.ida.basicblock
from capa.features.common import Feature
from capa.features.address import Address, AbsoluteVirtualAddress
from capa.features.extractors.strings import DEFAULT_STRING_LENGTH
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, FeatureExtractor


class IdaFeatureExtractor(FeatureExtractor):
def __init__(self):
def __init__(self, min_len: int = DEFAULT_STRING_LENGTH):
super().__init__()
self.global_features: List[Tuple[Feature, Address]] = []
self.global_features.extend(capa.features.extractors.ida.file.extract_file_format())
self.global_features.extend(capa.features.extractors.ida.global_.extract_os())
self.global_features.extend(capa.features.extractors.ida.global_.extract_arch())
self.min_len = min_len

def get_base_address(self):
return AbsoluteVirtualAddress(idaapi.get_imagebase())
Expand All @@ -36,18 +38,17 @@ def extract_global_features(self):
yield from self.global_features

def extract_file_features(self):
yield from capa.features.extractors.ida.file.extract_features()
yield from capa.features.extractors.ida.file.extract_features(file_ctx={"min_len": self.min_len})

def get_functions(self) -> Iterator[FunctionHandle]:
import capa.features.extractors.ida.helpers as ida_helpers

# ignore library functions and thunk functions as identified by IDA
yield from ida_helpers.get_functions(skip_thunks=True, skip_libs=True)

@staticmethod
def get_function(ea: int) -> FunctionHandle:
def get_function(self, ea: int) -> FunctionHandle:
f = idaapi.get_func(ea)
return FunctionHandle(address=AbsoluteVirtualAddress(f.start_ea), inner=f)
return FunctionHandle(address=AbsoluteVirtualAddress(f.start_ea), inner=f, ctx={"min_len": self.min_len})
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good change


def extract_function_features(self, fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]:
yield from capa.features.extractors.ida.function.extract_features(fh)
Expand Down
Loading