Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARM: add support for arm architecture #1803

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/pyinstaller/hooks/hook-vivisect.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@
"vivisect.analysis.amd64",
"vivisect.analysis.amd64.emulation",
"vivisect.analysis.amd64.golang",
"vivisect.analysis.arm",
"vivisect.analysis.arm.emulation",
"vivisect.analysis.arm.renaming",
"vivisect.analysis.arm.thunk_reg",
"vivisect.analysis.crypto",
"vivisect.analysis.crypto.constants",
"vivisect.analysis.elf",
Expand Down Expand Up @@ -76,6 +80,7 @@
"vivisect.analysis.ms.vftables",
"vivisect.analysis.pe",
"vivisect.impapi.posix.amd64",
"vivisect.impapi.posix.arm",
"vivisect.impapi.posix.i386",
"vivisect.impapi.windows",
"vivisect.impapi.windows.advapi_32",
Expand Down
5 changes: 4 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,11 @@
### New Features
- ghidra: add Ghidra feature extractor and supporting code #1770 @colton-gabertan
- ghidra: add entry script helping users run capa against a loaded Ghidra database #1767 @mike-hunhoff
- binja: add support for forwarded exports #1646 @xusheng6
binja: add support for forwarded exports #1646 @xusheng6
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
binja: add support for forwarded exports #1646 @xusheng6
- binja: add support for forwarded exports #1646 @xusheng6

- binja: add support for symtab names #1504 @xusheng6
- ARM: add support for ARM architecture #1774 @Gatewatcher
- ELF: improve ELF stripper
- ELF: improve analysis of statically linked ELF files

### Breaking Changes

Expand Down
3 changes: 2 additions & 1 deletion capa/features/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,9 +407,10 @@ def get_value_str(self):
# other candidates here: https://docs.microsoft.com/en-us/windows/win32/debug/pe-format#machine-types
ARCH_I386 = "i386"
ARCH_AMD64 = "amd64"
ARCH_ARM = "ARM"
# dotnet
ARCH_ANY = "any"
VALID_ARCH = (ARCH_I386, ARCH_AMD64, ARCH_ANY)
VALID_ARCH = (ARCH_I386, ARCH_AMD64, ARCH_ARM, ARCH_ANY)


class Arch(Feature):
Expand Down
2 changes: 1 addition & 1 deletion capa/features/extractors/elf.py
Original file line number Diff line number Diff line change
Expand Up @@ -978,7 +978,7 @@ def detect_elf_os(f) -> str:
elif symtab_guess:
ret = symtab_guess

return ret.value if ret is not None else "unknown"
return ret.value if ret is not None else "linux"


def detect_elf_arch(f: BinaryIO) -> str:
Expand Down
2 changes: 2 additions & 0 deletions capa/features/extractors/elffile.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,8 @@ def extract_file_arch(elf: ELFFile, **kwargs):
yield Arch("i386"), NO_ADDRESS
elif arch == "x64":
yield Arch("amd64"), NO_ADDRESS
elif arch == "ARM":
yield Arch("ARM"), NO_ADDRESS
else:
logger.warning("unsupported architecture: %s", arch)

Expand Down
20 changes: 13 additions & 7 deletions capa/features/extractors/viv/basicblock.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@

import string
import struct
from typing import Tuple, Iterator
from typing import Tuple, Union, Iterator

import envi
import envi.archs.arm.disasm
import envi.archs.i386.disasm

from capa.features.common import Feature, Characteristic
Expand Down Expand Up @@ -76,7 +77,7 @@ def extract_stackstring(f: FunctionHandle, bb: BBHandle) -> Iterator[Tuple[Featu
yield Characteristic("stack string"), bb.address


def is_mov_imm_to_stack(instr: envi.archs.i386.disasm.i386Opcode) -> bool:
def is_mov_imm_to_stack(instr: Union[envi.archs.i386.disasm.i386Opcode, envi.archs.arm.disasm.ArmOpcode]) -> bool:
"""
Return if instruction moves immediate onto stack
"""
Expand All @@ -92,22 +93,27 @@ def is_mov_imm_to_stack(instr: envi.archs.i386.disasm.i386Opcode) -> bool:
if not src.isImmed():
return False

if not isinstance(dst, envi.archs.i386.disasm.i386SibOper) and not isinstance(
dst, envi.archs.i386.disasm.i386RegMemOper
if (
not isinstance(dst, envi.archs.i386.disasm.i386SibOper)
and not isinstance(dst, envi.archs.i386.disasm.i386RegMemOper)
Comment on lines +97 to +98
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
not isinstance(dst, envi.archs.i386.disasm.i386SibOper)
and not isinstance(dst, envi.archs.i386.disasm.i386RegMemOper)
not isinstance(dst, (envi.archs.i386.disasm.i386SibOper, envi.archs.i386.disasm.i386RegMemOper))

and not isinstance(dst, envi.archs.arm.disasm.ArmRegOper)
):
return False

if not dst.reg:
return False

rname = dst._dis_regctx.getRegisterName(dst.reg)
if rname not in ["ebp", "rbp", "esp", "rsp"]:
if isinstance(dst, (envi.archs.i386.disasm.i386SibOper, envi.archs.i386.disasm.i386RegMemOper)):
rname = dst._dis_regctx.getRegisterName(dst.reg)
else:
rname = dst.reg
if rname not in ["ebp", "rbp", "esp", "rsp", envi.archs.arm.disasm.REG_SP, envi.archs.arm.disasm.REG_BP]:
Comment on lines +106 to +110
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can this be simplified to just use the constant or the name?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So we import envi.archs.arm.disasm.REG_SP as REG_SP and just use REG_SP, for example?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, exactly

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mr-tz i'm not sure i agree, because then it won't be obvious to a reader if the SP/BP are related to arm vs x86 vs ...

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I meant to use:

  • envi.archs.i386.disasm.REG_EBP (or similar [untested])
  • envi.archs.amd64.disasm.REG_RBP (or similar [untested])
  • etc.

Instead of "ebp", "rbp", etc.

return False

return True


def get_printable_len(oper: envi.archs.i386.disasm.i386ImmOper) -> int:
def get_printable_len(oper: Union[envi.archs.i386.disasm.i386ImmOper, envi.archs.arm.disasm.ArmImmOper]) -> int:
"""
Return string length if all operand bytes are ascii or utf16-le printable
"""
Expand Down
9 changes: 7 additions & 2 deletions capa/features/extractors/viv/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import capa.features.extractors.viv.insn
import capa.features.extractors.viv.global_
import capa.features.extractors.viv.function
import capa.features.extractors.viv.insn_arm
import capa.features.extractors.viv.basicblock
from capa.features.common import Feature
from capa.features.address import Address, AbsoluteVirtualAddress
Expand All @@ -26,10 +27,11 @@


class VivisectFeatureExtractor(FeatureExtractor):
def __init__(self, vw, path: Path, os):
def __init__(self, vw, path: Path, os, arm=False):
super().__init__()
self.vw = vw
self.path = path
self.arm = arm
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggest to use arch here instead

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree. I will change it

self.buf = path.read_bytes()

# pre-compute these because we'll yield them at *every* scope.
Expand Down Expand Up @@ -74,7 +76,10 @@ def get_instructions(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[InsnHa
def extract_insn_features(
self, fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
) -> Iterator[Tuple[Feature, Address]]:
yield from capa.features.extractors.viv.insn.extract_features(fh, bbh, ih)
if self.arm:
yield from capa.features.extractors.viv.insn_arm.extract_features(fh, bbh, ih)
else:
yield from capa.features.extractors.viv.insn.extract_features(fh, bbh, ih)

def is_library_function(self, addr):
return viv_utils.flirt.is_library_function(self.vw, addr)
Expand Down
2 changes: 1 addition & 1 deletion capa/features/extractors/viv/function.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def extract_function_loop(fhandle: FunctionHandle) -> Iterator[Tuple[Feature, Ad
bflags & envi.BR_COND
or bflags & envi.BR_FALL
or bflags & envi.BR_TABLE
or bb.instructions[-1].mnem == "jmp"
or bb.instructions[-1].mnem in ["jmp", "b", "bx"]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
or bb.instructions[-1].mnem in ["jmp", "b", "bx"]
or bb.instructions[-1].mnem in ("jmp", "b", "bx")

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a little worried about mixing x86/amd64 and ARM throughout, so maybe we can add comments or separate them at least in some way for better maintainability?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agree about mixing the logic, here and in other comments.

in general, i'd prefer to break out functions with human-readable names, like is_jump and is_jump_x86/is_jump_arm but i also recognize the overhead of making all these changes. please let us know what you're comfortable doing and what is reasonable for us to contribute, too.

):
edges.append((bb.va, bva))

Expand Down
5 changes: 4 additions & 1 deletion capa/features/extractors/viv/global_.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import logging
from typing import Tuple, Iterator

from capa.features.common import ARCH_I386, ARCH_AMD64, Arch, Feature
from capa.features.common import ARCH_ARM, ARCH_I386, ARCH_AMD64, Arch, Feature
from capa.features.address import NO_ADDRESS, Address

logger = logging.getLogger(__name__)
Expand All @@ -22,6 +22,9 @@ def extract_arch(vw) -> Iterator[Tuple[Feature, Address]]:
elif arch == "i386":
yield Arch(ARCH_I386), NO_ADDRESS

elif arch == "ARM":
yield Arch(ARCH_ARM), NO_ADDRESS

else:
# we likely end up here:
# 1. handling a new architecture (e.g. aarch64)
Expand Down
17 changes: 17 additions & 0 deletions capa/features/extractors/viv/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# See the License for the specific language governing permissions and limitations under the License.
from typing import Optional

import envi
from vivisect import VivWorkspace
from vivisect.const import XR_TO, REF_CODE

Expand All @@ -21,3 +22,19 @@ def get_coderef_from(vw: VivWorkspace, va: int) -> Optional[int]:
return xrefs[0][XR_TO]
else:
return None


def read_memory(vw, va: int, size: int) -> bytes:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good moving this here

# as documented in #176, vivisect will not readMemory() when the section is not marked readable.
#
# but here, we don't care about permissions.
# so, copy the viv implementation of readMemory and remove the permissions check.
#
# this is derived from:
# https://github.com/vivisect/vivisect/blob/5eb4d237bddd4069449a6bc094d332ceed6f9a96/envi/memory.py#L453-L462
for mva, mmaxva, mmap, mbytes in vw._map_defs:
if va >= mva and va < mmaxva:
mva, msize, mperms, mfname = mmap
offset = va - mva
return mbytes[offset : offset + size]
raise envi.exc.SegmentationViolation(va)
58 changes: 53 additions & 5 deletions capa/features/extractors/viv/indirect_calls.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import envi
import vivisect.const
import envi.archs.arm.disasm
import envi.archs.i386.disasm
import envi.archs.amd64.disasm
from vivisect import VivWorkspace
Expand All @@ -20,12 +21,15 @@
i386ImmOper = envi.archs.i386.disasm.i386ImmOper
i386ImmMemOper = envi.archs.i386.disasm.i386ImmMemOper
Amd64RipRelOper = envi.archs.amd64.disasm.Amd64RipRelOper
ARMRegOper = envi.archs.arm.disasm.ArmRegOper
ARMImmOper = envi.archs.arm.disasm.ArmImmOper
ARMScaledOffsetOper = envi.archs.arm.disasm.ArmScaledOffsetOper
LOC_OP = vivisect.const.LOC_OP
IF_NOFALL = envi.IF_NOFALL
REF_CODE = vivisect.const.REF_CODE
FAR_BRANCH_MASK = envi.BR_PROC | envi.BR_DEREF | envi.BR_ARCH

DESTRUCTIVE_MNEMONICS = ("mov", "lea", "pop", "xor")
DESTRUCTIVE_MNEMONICS = ("mov", "lea", "ldr", "pop", "xor", "eor")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
DESTRUCTIVE_MNEMONICS = ("mov", "lea", "ldr", "pop", "xor", "eor")
DESTRUCTIVE_MNEMONICS = (
# x86
"mov", "lea", "pop", "xor",
# arm
"ldr", "eor")

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

see comment above on mixing archs, something like this could help

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, great idea, I'll try to apply this where I can. I think it is only necessary for mnemonics and not for constants with arm in their names but it's up to you.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

even further, we could use sets like:

DESTRUCTIVE_MNEMONICS_X86 = {
    "mov", "lea", "pop", "xor",
}
DESTRUCTIVE_MNEMONICS_ARM = {
    "ldr", "eor",
}
DESTRUCTIVE_MNEMONICS = DESTRUCTIVE_MNEMONICS_X86 | DESTRUCTIVE_MNEMONICS_ARM



def get_previous_instructions(vw: VivWorkspace, va: int) -> List[int]:
Expand Down Expand Up @@ -71,6 +75,38 @@ class NotFoundError(Exception):
pass


def find_value(vw: VivWorkspace, va: int, reg: int, q):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a comment detailing what is achieved here, please?

tmp = 0
seen = set([]) # type: Set[int]

q.extend(get_previous_instructions(vw, va))
while q:
cur = q.popleft()
if cur in seen:
continue
seen.add(cur)
insn = vw.parseOpcode(cur)
if len(insn.opers) == 0:
q.extend(get_previous_instructions(vw, cur))
continue

opnd0 = insn.opers[0]
if not (isinstance(opnd0, ARMRegOper) and opnd0.reg == reg):
q.extend(get_previous_instructions(vw, cur))
continue
if insn.mnem == "sub" and isinstance(insn.opers[1], ARMImmOper):
tmp -= insn.opers[1].val
q.extend(get_previous_instructions(vw, cur))
continue
if insn.mnem == "add" and isinstance(insn.opers[1], ARMImmOper):
tmp += insn.opers[1].val
q.extend(get_previous_instructions(vw, cur))
continue
if insn.mnem == "mov" and isinstance(insn.opers[1], ARMImmOper):
return insn.opers[1].val + tmp
return None


def find_definition(vw: VivWorkspace, va: int, reg: int) -> Tuple[int, Optional[int]]:
"""
scan backwards from the given address looking for assignments to the given register.
Expand Down Expand Up @@ -106,7 +142,9 @@ def find_definition(vw: VivWorkspace, va: int, reg: int) -> Tuple[int, Optional[
continue

opnd0 = insn.opers[0]
if not (isinstance(opnd0, i386RegOper) and opnd0.reg == reg and insn.mnem in DESTRUCTIVE_MNEMONICS):
if not (
isinstance(opnd0, (i386RegOper, ARMRegOper)) and opnd0.reg == reg and insn.mnem in DESTRUCTIVE_MNEMONICS
):
q.extend(get_previous_instructions(vw, cur))
continue

Expand All @@ -115,16 +153,24 @@ def find_definition(vw: VivWorkspace, va: int, reg: int) -> Tuple[int, Optional[
# we currently only support extracting the constant from something like: `mov $reg, IAT`
# so, any other pattern results in an unknown value, represented by None.
# this is a good place to extend in the future, if we need more robust support.
if insn.mnem != "mov":
if insn.mnem not in ("mov", "ldr"):
return (cur, None)
else:
opnd1 = insn.opers[1]
if isinstance(opnd1, i386ImmOper):
if isinstance(opnd1, (i386ImmOper, ARMImmOper)):
Comment on lines +156 to +160
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

again, for maintainability I'd suggest to add comments or split them up into separate if lines

return (cur, opnd1.getOperValue(opnd1))
elif isinstance(opnd1, i386ImmMemOper):
return (cur, opnd1.getOperAddr(opnd1))
elif isinstance(opnd1, Amd64RipRelOper):
return (cur, opnd1.getOperAddr(insn))
elif isinstance(opnd1, ARMScaledOffsetOper):
base_reg = find_value(vw, cur, opnd1.base_reg, q)
if base_reg is None:
return (cur, None)
offset_reg = find_value(vw, cur, opnd1.offset_reg, q)
if offset_reg is None:
return (cur, None)
return (cur, base_reg + offset_reg)
else:
# might be something like: `mov $reg, dword_401000[eax]`
return (cur, None)
Expand All @@ -136,7 +182,9 @@ def is_indirect_call(vw: VivWorkspace, va: int, insn: envi.Opcode) -> bool:
if insn is None:
insn = vw.parseOpcode(va)

return insn.mnem in ("call", "jmp") and isinstance(insn.opers[0], envi.archs.i386.disasm.i386RegOper)
return insn.mnem in ("call", "jmp", "bl", "blx", "b", "bx") and isinstance(
insn.opers[0], (envi.archs.i386.disasm.i386RegOper, envi.archs.arm.disasm.ArmRegOper)
)
Comment on lines +185 to +187
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

split up by arch



def resolve_indirect_call(vw: VivWorkspace, va: int, insn: envi.Opcode) -> Tuple[int, Optional[int]]:
Expand Down
27 changes: 11 additions & 16 deletions capa/features/extractors/viv/insn.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
from capa.features.common import MAX_BYTES_FEATURE_SIZE, THUNK_CHAIN_DEPTH_DELTA, Bytes, String, Feature, Characteristic
from capa.features.address import Address, AbsoluteVirtualAddress
from capa.features.extractors.elf import SymTab
from capa.features.extractors.viv.helpers import read_memory
from capa.features.extractors.viv.syscall import get_library_function_name
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle
from capa.features.extractors.viv.indirect_calls import NotFoundError, resolve_indirect_call

Expand Down Expand Up @@ -81,6 +83,15 @@ def extract_insn_api_features(fh: FunctionHandle, bb, ih: InsnHandle) -> Iterato
if f.vw.getFunctionMeta(f.va, "Thunk"):
return

# handle direct system calls: `syscall`, or `int` with immediate 128 (0x80), are resolved to a library function name.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

neat addition!

if insn.mnem in ("int", "syscall"):
if insn.mnem != "int" or insn.opers[0].imm == 128:
name = get_library_function_name(f.vw, bb)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pass the instruction here instead of bb

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this called per instruction, but then enumerates the full basic block again and again?

if name is None:
return
yield API(name), ih.address
return

# traditional call via IAT
if isinstance(insn.opers[0], envi.archs.i386.disasm.i386ImmMemOper):
oper = insn.opers[0]
Expand Down Expand Up @@ -222,22 +233,6 @@ def derefs(vw, p):
p = next


def read_memory(vw, va: int, size: int) -> bytes:
    """
    Read up to `size` bytes starting at virtual address `va`, ignoring memory permissions.

    as documented in #176, vivisect will not readMemory() when the section is not marked readable.

    but here, we don't care about permissions.
    so, copy the viv implementation of readMemory and remove the permissions check.

    this is derived from:
      https://github.com/vivisect/vivisect/blob/5eb4d237bddd4069449a6bc094d332ceed6f9a96/envi/memory.py#L453-L462

    args:
      vw: workspace exposing `_map_defs`, iterated here as
          (start va, end va, map descriptor, backing bytes) entries.
      va: virtual address to start reading from.
      size: maximum number of bytes to read.

    returns:
      the bytes found at `va`. the read is served from the first map containing `va`
      and is a plain slice of that map's bytes, so it may be shorter than `size`
      when the requested range runs past the end of the map.

    raises:
      envi.exc.SegmentationViolation: when no map contains `va`.
    """
    # the map descriptor is not needed: the start address is already part of the
    # outer entry, and permissions are deliberately not inspected.
    for mva, mmaxva, _mmap, mbytes in vw._map_defs:
        if mva <= va < mmaxva:
            offset = va - mva
            return mbytes[offset : offset + size]
    raise envi.exc.SegmentationViolation(va)


def read_bytes(vw, va: int) -> bytes:
"""
read up to MAX_BYTES_FEATURE_SIZE from the given address.
Expand Down
Loading
Loading