-
Notifications
You must be signed in to change notification settings - Fork 567
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ARM: add support for arm architecture #1803
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
|
@@ -8,9 +8,10 @@ | |||||||
|
||||||||
import string | ||||||||
import struct | ||||||||
from typing import Tuple, Iterator | ||||||||
from typing import Tuple, Union, Iterator | ||||||||
|
||||||||
import envi | ||||||||
import envi.archs.arm.disasm | ||||||||
import envi.archs.i386.disasm | ||||||||
|
||||||||
from capa.features.common import Feature, Characteristic | ||||||||
|
@@ -76,7 +77,7 @@ def extract_stackstring(f: FunctionHandle, bb: BBHandle) -> Iterator[Tuple[Featu | |||||||
yield Characteristic("stack string"), bb.address | ||||||||
|
||||||||
|
||||||||
def is_mov_imm_to_stack(instr: envi.archs.i386.disasm.i386Opcode) -> bool: | ||||||||
def is_mov_imm_to_stack(instr: Union[envi.archs.i386.disasm.i386Opcode, envi.archs.arm.disasm.ArmOpcode]) -> bool: | ||||||||
""" | ||||||||
Return if instruction moves immediate onto stack | ||||||||
""" | ||||||||
|
@@ -92,22 +93,27 @@ def is_mov_imm_to_stack(instr: envi.archs.i386.disasm.i386Opcode) -> bool: | |||||||
if not src.isImmed(): | ||||||||
return False | ||||||||
|
||||||||
if not isinstance(dst, envi.archs.i386.disasm.i386SibOper) and not isinstance( | ||||||||
dst, envi.archs.i386.disasm.i386RegMemOper | ||||||||
if ( | ||||||||
not isinstance(dst, envi.archs.i386.disasm.i386SibOper) | ||||||||
and not isinstance(dst, envi.archs.i386.disasm.i386RegMemOper) | ||||||||
Comment on lines
+97
to
+98
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
and not isinstance(dst, envi.archs.arm.disasm.ArmRegOper) | ||||||||
): | ||||||||
return False | ||||||||
|
||||||||
if not dst.reg: | ||||||||
return False | ||||||||
|
||||||||
rname = dst._dis_regctx.getRegisterName(dst.reg) | ||||||||
if rname not in ["ebp", "rbp", "esp", "rsp"]: | ||||||||
if isinstance(dst, (envi.archs.i386.disasm.i386SibOper, envi.archs.i386.disasm.i386RegMemOper)): | ||||||||
rname = dst._dis_regctx.getRegisterName(dst.reg) | ||||||||
else: | ||||||||
rname = dst.reg | ||||||||
if rname not in ["ebp", "rbp", "esp", "rsp", envi.archs.arm.disasm.REG_SP, envi.archs.arm.disasm.REG_BP]: | ||||||||
Comment on lines
+106
to
+110
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can this be simplified to just use the constant or the name? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. So we import envi.archs.arm.disasm.REG_SP as REG_SP and just use REG_SP, for example? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, exactly. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @mr-tz I'm not sure I agree, because then it won't be obvious to a reader whether the SP/BP are related to ARM vs. x86 vs. ... There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I meant to use:
Instead of "ebp", "rbp", etc. |
||||||||
return False | ||||||||
|
||||||||
return True | ||||||||
|
||||||||
|
||||||||
def get_printable_len(oper: envi.archs.i386.disasm.i386ImmOper) -> int: | ||||||||
def get_printable_len(oper: Union[envi.archs.i386.disasm.i386ImmOper, envi.archs.arm.disasm.ArmImmOper]) -> int: | ||||||||
""" | ||||||||
Return string length if all operand bytes are ascii or utf16-le printable | ||||||||
""" | ||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,6 +17,7 @@ | |
import capa.features.extractors.viv.insn | ||
import capa.features.extractors.viv.global_ | ||
import capa.features.extractors.viv.function | ||
import capa.features.extractors.viv.insn_arm | ||
import capa.features.extractors.viv.basicblock | ||
from capa.features.common import Feature | ||
from capa.features.address import Address, AbsoluteVirtualAddress | ||
|
@@ -26,10 +27,11 @@ | |
|
||
|
||
class VivisectFeatureExtractor(FeatureExtractor): | ||
def __init__(self, vw, path: Path, os): | ||
def __init__(self, vw, path: Path, os, arm=False): | ||
super().__init__() | ||
self.vw = vw | ||
self.path = path | ||
self.arm = arm | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. suggest to use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree. I will change it |
||
self.buf = path.read_bytes() | ||
|
||
# pre-compute these because we'll yield them at *every* scope. | ||
|
@@ -74,7 +76,10 @@ def get_instructions(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[InsnHa | |
def extract_insn_features( | ||
self, fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle | ||
) -> Iterator[Tuple[Feature, Address]]: | ||
yield from capa.features.extractors.viv.insn.extract_features(fh, bbh, ih) | ||
if self.arm: | ||
yield from capa.features.extractors.viv.insn_arm.extract_features(fh, bbh, ih) | ||
else: | ||
yield from capa.features.extractors.viv.insn.extract_features(fh, bbh, ih) | ||
|
||
def is_library_function(self, addr): | ||
return viv_utils.flirt.is_library_function(self.vw, addr) | ||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -80,7 +80,7 @@ def extract_function_loop(fhandle: FunctionHandle) -> Iterator[Tuple[Feature, Ad | |||||
bflags & envi.BR_COND | ||||||
or bflags & envi.BR_FALL | ||||||
or bflags & envi.BR_TABLE | ||||||
or bb.instructions[-1].mnem == "jmp" | ||||||
or bb.instructions[-1].mnem in ["jmp", "b", "bx"] | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm a little worried about mixing x86/amd64 and ARM throughout, so maybe we can add comments or at least separate them in some way for better maintainability? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Agree about mixing the logic, here and in other comments. In general, I'd prefer to break out functions with human-readable names, like |
||||||
): | ||||||
edges.append((bb.va, bva)) | ||||||
|
||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,6 +7,7 @@ | |
# See the License for the specific language governing permissions and limitations under the License. | ||
from typing import Optional | ||
|
||
import envi | ||
from vivisect import VivWorkspace | ||
from vivisect.const import XR_TO, REF_CODE | ||
|
||
|
@@ -21,3 +22,19 @@ def get_coderef_from(vw: VivWorkspace, va: int) -> Optional[int]: | |
return xrefs[0][XR_TO] | ||
else: | ||
return None | ||
|
||
|
||
def read_memory(vw, va: int, size: int) -> bytes: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. good moving this here |
||
# as documented in #176, vivisect will not readMemory() when the section is not marked readable. | ||
# | ||
# but here, we don't care about permissions. | ||
# so, copy the viv implementation of readMemory and remove the permissions check. | ||
# | ||
# this is derived from: | ||
# https://github.com/vivisect/vivisect/blob/5eb4d237bddd4069449a6bc094d332ceed6f9a96/envi/memory.py#L453-L462 | ||
for mva, mmaxva, mmap, mbytes in vw._map_defs: | ||
if va >= mva and va < mmaxva: | ||
mva, msize, mperms, mfname = mmap | ||
offset = va - mva | ||
return mbytes[offset : offset + size] | ||
raise envi.exc.SegmentationViolation(va) |
Original file line number | Diff line number | Diff line change | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -11,6 +11,7 @@ | |||||||||||||
|
||||||||||||||
import envi | ||||||||||||||
import vivisect.const | ||||||||||||||
import envi.archs.arm.disasm | ||||||||||||||
import envi.archs.i386.disasm | ||||||||||||||
import envi.archs.amd64.disasm | ||||||||||||||
from vivisect import VivWorkspace | ||||||||||||||
|
@@ -20,12 +21,15 @@ | |||||||||||||
i386ImmOper = envi.archs.i386.disasm.i386ImmOper | ||||||||||||||
i386ImmMemOper = envi.archs.i386.disasm.i386ImmMemOper | ||||||||||||||
Amd64RipRelOper = envi.archs.amd64.disasm.Amd64RipRelOper | ||||||||||||||
ARMRegOper = envi.archs.arm.disasm.ArmRegOper | ||||||||||||||
ARMImmOper = envi.archs.arm.disasm.ArmImmOper | ||||||||||||||
ARMScaledOffsetOper = envi.archs.arm.disasm.ArmScaledOffsetOper | ||||||||||||||
LOC_OP = vivisect.const.LOC_OP | ||||||||||||||
IF_NOFALL = envi.IF_NOFALL | ||||||||||||||
REF_CODE = vivisect.const.REF_CODE | ||||||||||||||
FAR_BRANCH_MASK = envi.BR_PROC | envi.BR_DEREF | envi.BR_ARCH | ||||||||||||||
|
||||||||||||||
DESTRUCTIVE_MNEMONICS = ("mov", "lea", "pop", "xor") | ||||||||||||||
DESTRUCTIVE_MNEMONICS = ("mov", "lea", "ldr", "pop", "xor", "eor") | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. See the comment above on mixing archs; something like this could help. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, great idea, I'll try to apply this where I can. I think it is only necessary for mnemonics and not for constants with "arm" in their names, but it's up to you. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Even further, we could use sets like: DESTRUCTIVE_MNEMONICS_X86 = {
"mov", "lea", "pop", "xor",
}
DESTRUCTIVE_MNEMONICS_ARM = {
"ldr", "eor",
}
DESTRUCTIVE_MNEMONICS = DESTRUCTIVE_MNEMONICS_X86 | DESTRUCTIVE_MNEMONICS_ARM |
||||||||||||||
|
||||||||||||||
|
||||||||||||||
def get_previous_instructions(vw: VivWorkspace, va: int) -> List[int]: | ||||||||||||||
|
@@ -71,6 +75,38 @@ class NotFoundError(Exception): | |||||||||||||
pass | ||||||||||||||
|
||||||||||||||
|
||||||||||||||
def find_value(vw: VivWorkspace, va: int, reg: int, q): | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add a comment detailing what is achieved here, please? |
||||||||||||||
tmp = 0 | ||||||||||||||
seen = set([]) # type: Set[int] | ||||||||||||||
|
||||||||||||||
q.extend(get_previous_instructions(vw, va)) | ||||||||||||||
while q: | ||||||||||||||
cur = q.popleft() | ||||||||||||||
if cur in seen: | ||||||||||||||
continue | ||||||||||||||
seen.add(cur) | ||||||||||||||
insn = vw.parseOpcode(cur) | ||||||||||||||
if len(insn.opers) == 0: | ||||||||||||||
q.extend(get_previous_instructions(vw, cur)) | ||||||||||||||
continue | ||||||||||||||
|
||||||||||||||
opnd0 = insn.opers[0] | ||||||||||||||
if not (isinstance(opnd0, ARMRegOper) and opnd0.reg == reg): | ||||||||||||||
q.extend(get_previous_instructions(vw, cur)) | ||||||||||||||
continue | ||||||||||||||
if insn.mnem == "sub" and isinstance(insn.opers[1], ARMImmOper): | ||||||||||||||
tmp -= insn.opers[1].val | ||||||||||||||
q.extend(get_previous_instructions(vw, cur)) | ||||||||||||||
continue | ||||||||||||||
if insn.mnem == "add" and isinstance(insn.opers[1], ARMImmOper): | ||||||||||||||
tmp += insn.opers[1].val | ||||||||||||||
q.extend(get_previous_instructions(vw, cur)) | ||||||||||||||
continue | ||||||||||||||
if insn.mnem == "mov" and isinstance(insn.opers[1], ARMImmOper): | ||||||||||||||
return insn.opers[1].val + tmp | ||||||||||||||
return None | ||||||||||||||
|
||||||||||||||
|
||||||||||||||
def find_definition(vw: VivWorkspace, va: int, reg: int) -> Tuple[int, Optional[int]]: | ||||||||||||||
""" | ||||||||||||||
scan backwards from the given address looking for assignments to the given register. | ||||||||||||||
|
@@ -106,7 +142,9 @@ def find_definition(vw: VivWorkspace, va: int, reg: int) -> Tuple[int, Optional[ | |||||||||||||
continue | ||||||||||||||
|
||||||||||||||
opnd0 = insn.opers[0] | ||||||||||||||
if not (isinstance(opnd0, i386RegOper) and opnd0.reg == reg and insn.mnem in DESTRUCTIVE_MNEMONICS): | ||||||||||||||
if not ( | ||||||||||||||
isinstance(opnd0, (i386RegOper, ARMRegOper)) and opnd0.reg == reg and insn.mnem in DESTRUCTIVE_MNEMONICS | ||||||||||||||
): | ||||||||||||||
q.extend(get_previous_instructions(vw, cur)) | ||||||||||||||
continue | ||||||||||||||
|
||||||||||||||
|
@@ -115,16 +153,24 @@ def find_definition(vw: VivWorkspace, va: int, reg: int) -> Tuple[int, Optional[ | |||||||||||||
# we currently only support extracting the constant from something like: `mov $reg, IAT` | ||||||||||||||
# so, any other pattern results in an unknown value, represented by None. | ||||||||||||||
# this is a good place to extend in the future, if we need more robust support. | ||||||||||||||
if insn.mnem != "mov": | ||||||||||||||
if insn.mnem not in ("mov", "ldr"): | ||||||||||||||
return (cur, None) | ||||||||||||||
else: | ||||||||||||||
opnd1 = insn.opers[1] | ||||||||||||||
if isinstance(opnd1, i386ImmOper): | ||||||||||||||
if isinstance(opnd1, (i386ImmOper, ARMImmOper)): | ||||||||||||||
Comment on lines
+156
to
+160
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Again, for maintainability I'd suggest adding comments or splitting them up into separate `if` lines. |
||||||||||||||
return (cur, opnd1.getOperValue(opnd1)) | ||||||||||||||
elif isinstance(opnd1, i386ImmMemOper): | ||||||||||||||
return (cur, opnd1.getOperAddr(opnd1)) | ||||||||||||||
elif isinstance(opnd1, Amd64RipRelOper): | ||||||||||||||
return (cur, opnd1.getOperAddr(insn)) | ||||||||||||||
elif isinstance(opnd1, ARMScaledOffsetOper): | ||||||||||||||
base_reg = find_value(vw, cur, opnd1.base_reg, q) | ||||||||||||||
if base_reg is None: | ||||||||||||||
return (cur, None) | ||||||||||||||
offset_reg = find_value(vw, cur, opnd1.offset_reg, q) | ||||||||||||||
if offset_reg is None: | ||||||||||||||
return (cur, None) | ||||||||||||||
return (cur, base_reg + offset_reg) | ||||||||||||||
else: | ||||||||||||||
# might be something like: `mov $reg, dword_401000[eax]` | ||||||||||||||
return (cur, None) | ||||||||||||||
|
@@ -136,7 +182,9 @@ def is_indirect_call(vw: VivWorkspace, va: int, insn: envi.Opcode) -> bool: | |||||||||||||
if insn is None: | ||||||||||||||
insn = vw.parseOpcode(va) | ||||||||||||||
|
||||||||||||||
return insn.mnem in ("call", "jmp") and isinstance(insn.opers[0], envi.archs.i386.disasm.i386RegOper) | ||||||||||||||
return insn.mnem in ("call", "jmp", "bl", "blx", "b", "bx") and isinstance( | ||||||||||||||
insn.opers[0], (envi.archs.i386.disasm.i386RegOper, envi.archs.arm.disasm.ArmRegOper) | ||||||||||||||
) | ||||||||||||||
Comment on lines
+185
to
+187
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. split up by arch |
||||||||||||||
|
||||||||||||||
|
||||||||||||||
def resolve_indirect_call(vw: VivWorkspace, va: int, insn: envi.Opcode) -> Tuple[int, Optional[int]]: | ||||||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,6 +23,8 @@ | |
from capa.features.common import MAX_BYTES_FEATURE_SIZE, THUNK_CHAIN_DEPTH_DELTA, Bytes, String, Feature, Characteristic | ||
from capa.features.address import Address, AbsoluteVirtualAddress | ||
from capa.features.extractors.elf import SymTab | ||
from capa.features.extractors.viv.helpers import read_memory | ||
from capa.features.extractors.viv.syscall import get_library_function_name | ||
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle | ||
from capa.features.extractors.viv.indirect_calls import NotFoundError, resolve_indirect_call | ||
|
||
|
@@ -81,6 +83,15 @@ def extract_insn_api_features(fh: FunctionHandle, bb, ih: InsnHandle) -> Iterato | |
if f.vw.getFunctionMeta(f.va, "Thunk"): | ||
return | ||
|
||
# Added a case for catching basic blocks that contain direct calls to system functions. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. neat addition! |
||
if insn.mnem in ("int", "syscall"): | ||
if insn.mnem != "int" or insn.opers[0].imm == 128: | ||
name = get_library_function_name(f.vw, bb) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Pass the instruction here instead of There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this called per instruction, but then enumerates the full basic block again and again? |
||
if name is None: | ||
return | ||
yield API(name), ih.address | ||
return | ||
|
||
# traditional call via IAT | ||
if isinstance(insn.opers[0], envi.archs.i386.disasm.i386ImmMemOper): | ||
oper = insn.opers[0] | ||
|
@@ -222,22 +233,6 @@ def derefs(vw, p): | |
p = next | ||
|
||
|
||
def read_memory(vw, va: int, size: int) -> bytes: | ||
# as documented in #176, vivisect will not readMemory() when the section is not marked readable. | ||
# | ||
# but here, we don't care about permissions. | ||
# so, copy the viv implementation of readMemory and remove the permissions check. | ||
# | ||
# this is derived from: | ||
# https://github.com/vivisect/vivisect/blob/5eb4d237bddd4069449a6bc094d332ceed6f9a96/envi/memory.py#L453-L462 | ||
for mva, mmaxva, mmap, mbytes in vw._map_defs: | ||
if va >= mva and va < mmaxva: | ||
mva, msize, mperms, mfname = mmap | ||
offset = va - mva | ||
return mbytes[offset : offset + size] | ||
raise envi.exc.SegmentationViolation(va) | ||
|
||
|
||
def read_bytes(vw, va: int) -> bytes: | ||
""" | ||
read up to MAX_BYTES_FEATURE_SIZE from the given address. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.