-
Notifications
You must be signed in to change notification settings - Fork 578
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
make string length configurable and consistent across backends #1421
Changes from 10 commits
e86a06b
ad87e84
9f71ac4
3c5d538
7613e83
dc47ffa
e2a30bc
7934890
dfd8641
1fcfe62
021ede8
18f3135
b912804
01b322a
5520761
b6d3c44
8167686
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -20,6 +20,7 @@ | |||||
import capa.features.extractors.dnfile.function | ||||||
from capa.features.common import Feature | ||||||
from capa.features.address import NO_ADDRESS, Address, DNTokenAddress, DNTokenOffsetAddress | ||||||
from capa.features.extractors.strings import DEFAULT_STRING_LENGTH | ||||||
from capa.features.extractors.dnfile.types import DnType, DnUnmanagedMethod | ||||||
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, FeatureExtractor | ||||||
from capa.features.extractors.dnfile.helpers import ( | ||||||
|
@@ -68,9 +69,10 @@ def get_type(self, token: int) -> Optional[Union[DnType, DnUnmanagedMethod]]: | |||||
|
||||||
|
||||||
class DnfileFeatureExtractor(FeatureExtractor): | ||||||
def __init__(self, path: str): | ||||||
def __init__(self, path: str, min_len: int = DEFAULT_STRING_LENGTH): | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There's not enough context here to infer what "min_len" is the minimum length of, so let's include "str" in the variable/property names. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Since dnfile/file.py uses functions from dotnetfile.py, I changed |
||||||
super().__init__() | ||||||
self.pe: dnfile.dnPE = dnfile.dnPE(path) | ||||||
self.min_len = min_len | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
|
||||||
# pre-compute .NET token lookup tables; each .NET method has access to this cache for feature extraction | ||||||
# most relevant at instruction scope | ||||||
|
@@ -89,7 +91,9 @@ def extract_global_features(self): | |||||
yield from self.global_features | ||||||
|
||||||
def extract_file_features(self): | ||||||
yield from capa.features.extractors.dnfile.file.extract_features(self.pe) | ||||||
yield from capa.features.extractors.dnfile.file.extract_features( | ||||||
file_ctx={"pe": self.pe, "min_len": self.min_len} | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Given that we're invoking the "file" extractor, it's not necessary to include the term There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. While this isn't wrong, I'd just like to keep the code as concise and consistent as possible. |
||||||
) | ||||||
|
||||||
def get_functions(self) -> Iterator[FunctionHandle]: | ||||||
# create a method lookup table | ||||||
|
@@ -98,7 +102,13 @@ def get_functions(self) -> Iterator[FunctionHandle]: | |||||
fh: FunctionHandle = FunctionHandle( | ||||||
address=DNTokenAddress(token), | ||||||
inner=method, | ||||||
ctx={"pe": self.pe, "calls_from": set(), "calls_to": set(), "cache": self.token_cache}, | ||||||
ctx={ | ||||||
"pe": self.pe, | ||||||
"calls_from": set(), | ||||||
"calls_to": set(), | ||||||
"cache": self.token_cache, | ||||||
"min_len": self.min_len, | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
}, | ||||||
) | ||||||
|
||||||
# method tokens should be unique | ||||||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -18,37 +18,37 @@ | |||||
from capa.features.address import Address | ||||||
|
||||||
|
||||||
def extract_file_import_names(pe: dnfile.dnPE) -> Iterator[Tuple[Import, Address]]: | ||||||
yield from capa.features.extractors.dotnetfile.extract_file_import_names(pe=pe) | ||||||
def extract_file_import_names(file_ctx) -> Iterator[Tuple[Import, Address]]: | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
yield from capa.features.extractors.dotnetfile.extract_file_import_names(file_ctx) | ||||||
|
||||||
|
||||||
def extract_file_format(pe: dnfile.dnPE) -> Iterator[Tuple[Format, Address]]: | ||||||
yield from capa.features.extractors.dotnetfile.extract_file_format(pe=pe) | ||||||
def extract_file_format(**kwargs) -> Iterator[Tuple[Format, Address]]: | ||||||
yield from capa.features.extractors.dotnetfile.extract_file_format() | ||||||
|
||||||
|
||||||
def extract_file_function_names(pe: dnfile.dnPE) -> Iterator[Tuple[FunctionName, Address]]: | ||||||
yield from capa.features.extractors.dotnetfile.extract_file_function_names(pe=pe) | ||||||
def extract_file_function_names(file_ctx) -> Iterator[Tuple[FunctionName, Address]]: | ||||||
yield from capa.features.extractors.dotnetfile.extract_file_function_names(file_ctx) | ||||||
|
||||||
|
||||||
def extract_file_strings(pe: dnfile.dnPE) -> Iterator[Tuple[String, Address]]: | ||||||
yield from capa.features.extractors.dotnetfile.extract_file_strings(pe=pe) | ||||||
def extract_file_strings(file_ctx) -> Iterator[Tuple[String, Address]]: | ||||||
yield from capa.features.extractors.dotnetfile.extract_file_strings(file_ctx) | ||||||
|
||||||
|
||||||
def extract_file_mixed_mode_characteristic_features(pe: dnfile.dnPE) -> Iterator[Tuple[Characteristic, Address]]: | ||||||
yield from capa.features.extractors.dotnetfile.extract_file_mixed_mode_characteristic_features(pe=pe) | ||||||
def extract_file_mixed_mode_characteristic_features(file_ctx) -> Iterator[Tuple[Characteristic, Address]]: | ||||||
yield from capa.features.extractors.dotnetfile.extract_file_mixed_mode_characteristic_features(file_ctx) | ||||||
|
||||||
|
||||||
def extract_file_namespace_features(pe: dnfile.dnPE) -> Iterator[Tuple[Namespace, Address]]: | ||||||
yield from capa.features.extractors.dotnetfile.extract_file_namespace_features(pe=pe) | ||||||
def extract_file_namespace_features(file_ctx) -> Iterator[Tuple[Namespace, Address]]: | ||||||
yield from capa.features.extractors.dotnetfile.extract_file_namespace_features(file_ctx) | ||||||
|
||||||
|
||||||
def extract_file_class_features(pe: dnfile.dnPE) -> Iterator[Tuple[Class, Address]]: | ||||||
yield from capa.features.extractors.dotnetfile.extract_file_class_features(pe=pe) | ||||||
def extract_file_class_features(file_ctx) -> Iterator[Tuple[Class, Address]]: | ||||||
yield from capa.features.extractors.dotnetfile.extract_file_class_features(file_ctx) | ||||||
|
||||||
|
||||||
def extract_features(pe: dnfile.dnPE) -> Iterator[Tuple[Feature, Address]]: | ||||||
def extract_features(file_ctx) -> Iterator[Tuple[Feature, Address]]: | ||||||
for file_handler in FILE_HANDLERS: | ||||||
for feature, address in file_handler(pe): | ||||||
for feature, address in file_handler(file_ctx=file_ctx): | ||||||
yield feature, address | ||||||
|
||||||
|
||||||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -191,7 +191,7 @@ def extract_insn_string_features(fh: FunctionHandle, bh, ih: InsnHandle) -> Iter | |||||
if user_string is None: | ||||||
return | ||||||
|
||||||
if len(user_string) >= 4: | ||||||
if len(user_string) >= fh.ctx["min_len"]: | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
yield String(user_string), ih.address | ||||||
|
||||||
|
||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,16 +18,18 @@ | |
import capa.features.extractors.ida.basicblock | ||
from capa.features.common import Feature | ||
from capa.features.address import Address, AbsoluteVirtualAddress | ||
from capa.features.extractors.strings import DEFAULT_STRING_LENGTH | ||
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, FeatureExtractor | ||
|
||
|
||
class IdaFeatureExtractor(FeatureExtractor): | ||
def __init__(self): | ||
def __init__(self, min_len: int = DEFAULT_STRING_LENGTH): | ||
super().__init__() | ||
self.global_features: List[Tuple[Feature, Address]] = [] | ||
self.global_features.extend(capa.features.extractors.ida.file.extract_file_format()) | ||
self.global_features.extend(capa.features.extractors.ida.global_.extract_os()) | ||
self.global_features.extend(capa.features.extractors.ida.global_.extract_arch()) | ||
self.min_len = min_len | ||
|
||
def get_base_address(self): | ||
return AbsoluteVirtualAddress(idaapi.get_imagebase()) | ||
|
@@ -36,18 +38,17 @@ def extract_global_features(self): | |
yield from self.global_features | ||
|
||
def extract_file_features(self): | ||
yield from capa.features.extractors.ida.file.extract_features() | ||
yield from capa.features.extractors.ida.file.extract_features(file_ctx={"min_len": self.min_len}) | ||
|
||
def get_functions(self) -> Iterator[FunctionHandle]: | ||
import capa.features.extractors.ida.helpers as ida_helpers | ||
|
||
# ignore library functions and thunk functions as identified by IDA | ||
yield from ida_helpers.get_functions(skip_thunks=True, skip_libs=True) | ||
|
||
@staticmethod | ||
def get_function(ea: int) -> FunctionHandle: | ||
def get_function(self, ea: int) -> FunctionHandle: | ||
f = idaapi.get_func(ea) | ||
return FunctionHandle(address=AbsoluteVirtualAddress(f.start_ea), inner=f) | ||
return FunctionHandle(address=AbsoluteVirtualAddress(f.start_ea), inner=f, ctx={"min_len": self.min_len}) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Good change. |
||
|
||
def extract_function_features(self, fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: | ||
yield from capa.features.extractors.ida.function.extract_features(fh) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
great