From 577577b08344e84308da0dc3c6990348fecee871 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Thu, 29 Aug 2024 11:13:51 +0000 Subject: [PATCH 1/7] initial add --- tests/test_binexport_accessors.py | 82 +++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 tests/test_binexport_accessors.py diff --git a/tests/test_binexport_accessors.py b/tests/test_binexport_accessors.py new file mode 100644 index 000000000..357623a3b --- /dev/null +++ b/tests/test_binexport_accessors.py @@ -0,0 +1,82 @@ +# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +import logging +from typing import Any, Dict + +from google.protobuf.json_format import ParseDict + +from capa.features.extractors.binexport2.helpers import ( + get_operand_expressions, + get_instruction_mnemonic, + get_instruction_operands, +) +from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 + +logger = logging.getLogger(__name__) + +""" +mov x0, 0x20 +bl 0x100 +add x0, sp, 0x10 + +# not here yet ldr x0, [x1, 8] +""" + +BE2_DICT: Dict[str, Any] = { + "expression": [ + {"type": 1, "symbol": "x0"}, + {"type": 2, "immediate": 0x20}, + {"type": 3, "immediate": 0x100}, + {"type": 1, "symbol": "sp"}, + {"type": 3, "immediate": 0x10}, + ], + # operand consists of 1 or more expressions, linked together as a tree + "operand": [ + {"expression_index": [0]}, + {"expression_index": [1]}, + {"expression_index": [2]}, + {"expression_index": [3]}, + {"expression_index": [4]}, + ], + "mnemonic": [ + {"name": "mov"}, # mnem 0 + {"name": "bl"}, # mnem 1 + {"name": "add"}, # mnem 2 + ], + # instruction may have 0 or more operands + "instruction": [ + {"mnemonic_index": 0, "operand_index": [0, 1]}, + {"mnemonic_index": 1, "operand_index": [2]}, + {"mnemonic_index": 2, "operand_index": [0, 3, 4]}, + ], +} +BE2 = ParseDict( + BE2_DICT, + BinExport2(), +) + + +def test_get_instruction_mnemonic(): + mov = ParseDict(BE2_DICT["instruction"][0], BinExport2.Instruction()) + call = ParseDict(BE2_DICT["instruction"][1], BinExport2.Instruction()) + + assert get_instruction_mnemonic(BE2, mov) == "mov" + assert get_instruction_mnemonic(BE2, call) == "bl" + + +def test_get_instruction_operands(): + insn = ParseDict(BE2_DICT["instruction"][2], BinExport2.Instruction()) + + assert len(get_instruction_operands(BE2, insn)) == 3 + + +def test_get_operand_expressions(): + oper = ParseDict(BE2_DICT["operand"][0], BinExport2.Operand()) + + assert len(get_operand_expressions(BE2, oper)) == 1 From 5fd16c8bd3f2b098dc3225da6c277cc7b85c04f5 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Thu, 29 Aug 2024 12:02:59 +0000 Subject: [PATCH 2/7] test binexport scripts --- tests/test_scripts.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/test_scripts.py b/tests/test_scripts.py index 35bf5347f..5735143cf 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -6,6 +6,7 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. +import os import sys import logging import textwrap @@ -38,6 +39,10 @@ def get_report_file_path(): ) +def get_binexport2_file_path(): + return str(CD / "data" / "binexport2" / "mimikatz.exe_.ghidra.BinExport") + + def get_rules_path(): return str(CD / ".." / "rules") @@ -74,6 +79,22 @@ def test_scripts(script, args): assert p.returncode == 0 +@pytest.mark.parametrize( + "script,args", + [ + pytest.param("inspect-binexport2.py", [get_binexport2_file_path()]), + pytest.param("detect-binexport2-capabilities.py", [get_binexport2_file_path()]), + ], +) +def test_binexport_scripts(script, args): + # define sample bytes location + os.environ["CAPA_SAMPLES_DIR"] = str(Path(CD / "data")) + + script_path = get_script_path(script) + p = run_program(script_path, args) + assert p.returncode == 0 + + def test_bulk_process(tmp_path): # create test directory to recursively analyze t = tmp_path / "test" From b1211d91de7e6f1fa33ca7700e89e89b408190f1 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Tue, 3 Sep 2024 09:49:43 +0000 Subject: [PATCH 3/7] add tests using small ARM ELF --- tests/test_binexport_accessors.py | 254 +++++++++++++++++++++++++++--- 1 file changed, 236 insertions(+), 18 deletions(-) diff --git a/tests/test_binexport_accessors.py b/tests/test_binexport_accessors.py index 357623a3b..cef655eb0 100644 --- a/tests/test_binexport_accessors.py +++ b/tests/test_binexport_accessors.py @@ -6,28 +6,247 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. +import re import logging from typing import Any, Dict +from pathlib import Path +import pytest +import fixtures from google.protobuf.json_format import ParseDict from capa.features.extractors.binexport2.helpers import ( get_operand_expressions, get_instruction_mnemonic, get_instruction_operands, + get_operand_register_expression, + get_operand_immediate_expression, ) from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 +from capa.features.extractors.binexport2.arch.arm.helpers import is_stack_register_expression logger = logging.getLogger(__name__) +CD = Path(__file__).resolve().parent + + +# found via https://www.virustotal.com/gui/search/type%253Aelf%2520and%2520size%253A1.2kb%252B%2520and%2520size%253A1.4kb-%2520and%2520tag%253Aarm%2520and%2520not%2520tag%253Arelocatable%2520and%2520tag%253A64bits/files +# Ghidra disassembly of c7f38027552a3eca84e2bfc846ac1307fbf98657545426bb93a2d63555cbb486 +GHIDRA_DISASSEMBLY = """ + // + // segment_1 + // Loadable segment [0x200000 - 0x200157] + // ram:00200000-ram:00200157 + // + 00200000 7f 45 4c Elf64_Ehdr +... + // + // .text + // SHT_PROGBITS [0x210158 - 0x2101c7] + // ram:00210158-ram:002101c7 + // + ************************************************************** + * FUNCTION * + ************************************************************** + undefined entry() + undefined w0:1 + _start XREF[4]: Entry Point(*), 00200018(*), + entry 002000c0(*), + _elfSectionHeaders::00000050(*) + 00210158 20 00 80 d2 mov x0,#0x1 + 0021015c a1 02 00 58 ldr x1=>helloWorldStr,DAT_002101b0 = "Hello World!\n" + = 00000000002201C8h + 00210160 c2 02 00 58 ldr x2,DAT_002101b8 = 000000000000000Eh + 00210164 08 08 80 d2 mov x8,#0x40 + 00210168 01 00 00 d4 svc 0x0 + 0021016c a0 02 00 58 ldr x0=>$stringWith_Weird_Name,DAT_002101c0 = "This string has a very strang + = 00000000002201D6h + 00210170 04 00 00 94 bl printString undefined printString() + 00210174 60 0f 80 d2 mov x0,#0x7b + 00210178 a8 0b 80 d2 mov x8,#0x5d + 0021017c 01 00 00 d4 svc 0x0 + ************************************************************** + * FUNCTION * + ************************************************************** + undefined printString() + undefined w0:1 + printString XREF[1]: entry:00210170(c) + 00210180 01 00 80 d2 mov x1,#0x0 + strlenLoop XREF[1]: 00210194(j) + 00210184 02 68 61 38 ldrb w2,[x0, x1, LSL ] + 00210188 5f 00 00 71 cmp w2,#0x0 + 0021018c 60 00 00 54 b.eq strlenDone + 00210190 21 04 00 91 add x1,x1,#0x1 + 00210194 fc ff ff 17 b strlenLoop + strlenDone XREF[1]: 0021018c(j) + 00210198 e2 03 01 aa mov x2,x1 + 0021019c e1 03 00 aa mov x1,x0 + 002101a0 20 00 80 d2 mov x0,#0x1 + 002101a4 08 08 80 d2 mov x8,#0x40 + 002101a8 01 00 00 d4 svc 0x0 + 002101ac c0 03 5f d6 ret + DAT_002101b0 XREF[1]: entry:0021015c(R) + 002101b0 c8 01 22 undefined8 00000000002201C8h ? -> 002201c8 + 00 00 00 + 00 00 + DAT_002101b8 XREF[1]: entry:00210160(R) + 002101b8 0e 00 00 undefined8 000000000000000Eh + 00 00 00 + 00 00 + DAT_002101c0 XREF[1]: entry:0021016c(R) + 002101c0 d6 01 22 undefined8 00000000002201D6h ? -> 002201d6 + 00 00 00 + 00 00 + // + // .data + // SHT_PROGBITS [0x2201c8 - 0x2201fb] + // ram:002201c8-ram:002201fb + // + helloWorldStr XREF[3]: 002000f8(*), entry:0021015c(*), + _elfSectionHeaders::00000090(*) + 002201c8 48 65 6c ds "Hello World!\n" + 6c 6f 20 + 57 6f 72 + $stringWith_Weird_Name XREF[1]: entry:0021016c(*) + 002201d6 54 68 69 ds "This string has a very strange label\n" + 73 20 73 + 74 72 69 +... +""" + + +def _parse_ghidra_disassembly(disasm: str) -> dict: + dd = {} + # 00210158 20 00 80 d2 mov x0,#0x1 + # ^^^^^^^^ ^^^^^^^^^^^ ^^^ ^^ ^^^^ + # address bytes mnemonic o1,o2 (,o3) + pattern = re.compile( + r"^( ){8}(?P
[0-9a-f]+) " + "(?P([0-9a-f]{2}[ ]){4})\s+" + "(?P[\w\.]+)\s*" + "(?P[\w#$=>]+)?,?" + "((?P[\w#$=>]+))?,?" + "((?P[\w#$=>]+))?" + ) + for line in disasm.splitlines()[20:]: + m = pattern.match(line) + if m: + logger.debug("Match found\t%s\n\t\t\t\t%s", line, m.groupdict()) + dd[int(m["address"], 0x10)] = { + "bytes": m["bytes"].strip(), + "mnemonic": m["mnemonic"], + "operands": [e for e in [m["operand1"], m["operand2"], m["operand3"]] if e is not None], + } + else: + logger.debug("No match\t%s", line) + return dd + + +BE2_EXTRACTOR = fixtures.get_binexport_extractor( + CD + / "data" + / "binexport2" + / "c7f38027552a3eca84e2bfc846ac1307fbf98657545426bb93a2d63555cbb486.elf_.ghidra.BinExport" +) +PARSED_DISASM = _parse_ghidra_disassembly(GHIDRA_DISASSEMBLY) + + +def test_instruction_bytes(): + # more a data sanity check here as we don't test our code + for addr, de in PARSED_DISASM.items(): + insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr) + assert insn.raw_bytes == bytes.fromhex(de["bytes"]) + + +def test_get_instruction_mnemonic(): + for addr, de in PARSED_DISASM.items(): + insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr) + assert get_instruction_mnemonic(BE2_EXTRACTOR.be2, insn) == de["mnemonic"] + + +def test_get_instruction_operands_count(): + for addr, de in PARSED_DISASM.items(): + insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr) + ops = get_instruction_operands(BE2_EXTRACTOR.be2, insn) + # this line is not properly parsed from the Ghidra disassembly using the current regex + # 00210184 02 68 61 38 ldrb w2,[x0, x1, LSL ] + if addr == 0x210184: + assert len(ops) == 2 + else: + assert len(ops) == len(de["operands"]) + + +@pytest.mark.parametrize( + "addr,op_expressions", + [ + # 00210158 20 00 80 d2 mov x0,#0x1 + (0x210158, ("x0", "#0x1")), + # 0021015c a1 02 00 58 ldr x1=>helloWorldStr,DAT_002101b0 + (0x21015C, ("x1", "DAT_002101b0")), + # 00210184 02 68 61 38 ldrb w2,[x0, x1, LSL ] + (0x210184, ("w2", "[x0, x1, LSL ]")), + # 00210190 21 04 00 91 add x1,x1,#0x1 + (0x210190, ("x1", "x1", "#0x1")), + ], +) +def test_get_operand_expressions(addr, op_expressions): + insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr) + ops = get_instruction_operands(BE2_EXTRACTOR.be2, insn) + for i, op in enumerate(ops): + exps = get_operand_expressions(BE2_EXTRACTOR.be2, op) + assert len(exps) == 1 + assert exps[0].symbol == op_expressions[i] + + +@pytest.mark.parametrize( + "addr,reg_expressions", + [ + # 00210158 20 00 80 d2 mov x0,#0x1 + (0x210158, ("x0", None)), + # 0021015c a1 02 00 58 ldr x1=>helloWorldStr,DAT_002101b0 + (0x21015C, ("x1", None)), + # 00210184 02 68 61 38 ldrb w2,[x0, x1, LSL ] + (0x210184, ("w2", None)), + # 00210190 21 04 00 91 add x1,x1,#0x1 + (0x210190, ("x1", "x1", None)), + ], +) +def _TODO_test_get_operand_register_expression(addr, reg_expressions): + insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr) + ops = get_instruction_operands(BE2_EXTRACTOR.be2, insn) + for i, op in enumerate(ops): + reg_exp = get_operand_register_expression(BE2_EXTRACTOR.be2, op) + logger.debug("%s", get_operand_expressions(BE2_EXTRACTOR.be2, op)) + assert reg_exp == reg_expressions[i] + + +@pytest.mark.parametrize( + "addr,expressions", + [ + # 00210158 20 00 80 d2 mov x0,#0x1 + (0x210158, (None, 0x1)), + # 0021015c a1 02 00 58 ldr x1=>helloWorldStr,DAT_002101b0 + (0x21015C, (None, None)), + # 00210184 02 68 61 38 ldrb w2,[x0, x1, LSL ] + (0x210184, (None, None)), + # 00210190 21 04 00 91 add x1,x1,#0x1 + (0x210190, (None, None, 0x1)), + ], +) +def _TODO_test_get_operand_immediate_expression(addr, expressions): + insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr) + ops = get_instruction_operands(BE2_EXTRACTOR.be2, insn) + for i, op in enumerate(ops): + reg_exp = get_operand_immediate_expression(BE2_EXTRACTOR.be2, op) + logger.debug("%s", get_operand_expressions(BE2_EXTRACTOR.be2, op)) + assert reg_exp == expressions[i] + + """ mov x0, 0x20 bl 0x100 add x0, sp, 0x10 - -# not here yet ldr x0, [x1, 8] """ - BE2_DICT: Dict[str, Any] = { "expression": [ {"type": 1, "symbol": "x0"}, @@ -62,21 +281,20 @@ ) -def test_get_instruction_mnemonic(): +def _TODO_test_is_stack_register_expression(): mov = ParseDict(BE2_DICT["instruction"][0], BinExport2.Instruction()) - call = ParseDict(BE2_DICT["instruction"][1], BinExport2.Instruction()) - - assert get_instruction_mnemonic(BE2, mov) == "mov" - assert get_instruction_mnemonic(BE2, call) == "bl" - - -def test_get_instruction_operands(): - insn = ParseDict(BE2_DICT["instruction"][2], BinExport2.Instruction()) - - assert len(get_instruction_operands(BE2, insn)) == 3 - + add = ParseDict(BE2_DICT["instruction"][2], BinExport2.Instruction()) -def test_get_operand_expressions(): - oper = ParseDict(BE2_DICT["operand"][0], BinExport2.Operand()) + ops = get_instruction_operands(BE2_EXTRACTOR.be2, mov) + exps = get_operand_expressions(BE2_EXTRACTOR.be2, ops[0]) + assert is_stack_register_expression(BE2_EXTRACTOR.be2, exps[0]) is False + exps = get_operand_expressions(BE2_EXTRACTOR.be2, ops[1]) + assert is_stack_register_expression(BE2_EXTRACTOR.be2, exps[0]) is False - assert len(get_operand_expressions(BE2, oper)) == 1 + ops = get_instruction_operands(BE2_EXTRACTOR.be2, add) + exps = get_operand_expressions(BE2_EXTRACTOR.be2, ops[0]) + assert is_stack_register_expression(BE2_EXTRACTOR.be2, exps[0]) is False + exps = get_operand_expressions(BE2_EXTRACTOR.be2, ops[1]) + assert is_stack_register_expression(BE2_EXTRACTOR.be2, exps[0]) is True + exps = get_operand_expressions(BE2_EXTRACTOR.be2, ops[1]) + assert is_stack_register_expression(BE2_EXTRACTOR.be2, exps[0]) is False From c662176e38920b741dccca50637a664e9c94664d Mon Sep 17 00:00:00 2001 From: mr-tz Date: Tue, 3 Sep 2024 09:52:38 +0000 Subject: [PATCH 4/7] add method to get instruction by address --- .../extractors/binexport2/__init__.py | 7 +++ tests/test_binexport_accessors.py | 54 +++++++++---------- 2 files changed, 34 insertions(+), 27 deletions(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index 76731e8ac..dd860dbf6 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -235,6 +235,13 @@ def get_function_name_by_address(self, address: int) -> str: vertex_index: int = self.vertex_index_by_address[address] return self.get_function_name_by_vertex(vertex_index) + def get_instruction_by_address(self, address: int) -> BinExport2.Instruction: + for i, be2_insn in enumerate(self.be2.instruction): + insn = self.get_insn_address(i) + if address == insn: + return be2_insn + raise ValueError(f"address 0x{address:x} not found") + class BinExport2Analysis: def __init__(self, be2: BinExport2, idx: BinExport2Index, buf: bytes): diff --git a/tests/test_binexport_accessors.py b/tests/test_binexport_accessors.py index cef655eb0..9f61f1722 100644 --- a/tests/test_binexport_accessors.py +++ b/tests/test_binexport_accessors.py @@ -34,14 +34,14 @@ # Ghidra disassembly of c7f38027552a3eca84e2bfc846ac1307fbf98657545426bb93a2d63555cbb486 GHIDRA_DISASSEMBLY = """ // - // segment_1 + // segment_1 // Loadable segment [0x200000 - 0x200157] // ram:00200000-ram:00200157 // 00200000 7f 45 4c Elf64_Ehdr ... // - // .text + // .text // SHT_PROGBITS [0x210158 - 0x2101c7] // ram:00210158-ram:002101c7 // @@ -50,9 +50,9 @@ ************************************************************** undefined entry() undefined w0:1 - _start XREF[4]: Entry Point(*), 00200018(*), - entry 002000c0(*), - _elfSectionHeaders::00000050(*) + _start XREF[4]: Entry Point(*), 00200018(*), + entry 002000c0(*), + _elfSectionHeaders::00000050(*) 00210158 20 00 80 d2 mov x0,#0x1 0021015c a1 02 00 58 ldr x1=>helloWorldStr,DAT_002101b0 = "Hello World!\n" = 00000000002201C8h @@ -70,47 +70,47 @@ ************************************************************** undefined printString() undefined w0:1 - printString XREF[1]: entry:00210170(c) + printString XREF[1]: entry:00210170(c) 00210180 01 00 80 d2 mov x1,#0x0 - strlenLoop XREF[1]: 00210194(j) + strlenLoop XREF[1]: 00210194(j) 00210184 02 68 61 38 ldrb w2,[x0, x1, LSL ] 00210188 5f 00 00 71 cmp w2,#0x0 0021018c 60 00 00 54 b.eq strlenDone 00210190 21 04 00 91 add x1,x1,#0x1 00210194 fc ff ff 17 b strlenLoop - strlenDone XREF[1]: 0021018c(j) + strlenDone XREF[1]: 0021018c(j) 00210198 e2 03 01 aa mov x2,x1 0021019c e1 03 00 aa mov x1,x0 002101a0 20 00 80 d2 mov x0,#0x1 002101a4 08 08 80 d2 mov x8,#0x40 002101a8 01 00 00 d4 svc 0x0 002101ac c0 03 5f d6 ret - DAT_002101b0 XREF[1]: entry:0021015c(R) + DAT_002101b0 XREF[1]: entry:0021015c(R) 002101b0 c8 01 22 undefined8 00000000002201C8h ? -> 002201c8 - 00 00 00 + 00 00 00 00 00 - DAT_002101b8 XREF[1]: entry:00210160(R) + DAT_002101b8 XREF[1]: entry:00210160(R) 002101b8 0e 00 00 undefined8 000000000000000Eh - 00 00 00 + 00 00 00 00 00 - DAT_002101c0 XREF[1]: entry:0021016c(R) + DAT_002101c0 XREF[1]: entry:0021016c(R) 002101c0 d6 01 22 undefined8 00000000002201D6h ? -> 002201d6 - 00 00 00 + 00 00 00 00 00 // - // .data + // .data // SHT_PROGBITS [0x2201c8 - 0x2201fb] // ram:002201c8-ram:002201fb // - helloWorldStr XREF[3]: 002000f8(*), entry:0021015c(*), - _elfSectionHeaders::00000090(*) + helloWorldStr XREF[3]: 002000f8(*), entry:0021015c(*), + _elfSectionHeaders::00000090(*) 002201c8 48 65 6c ds "Hello World!\n" - 6c 6f 20 - 57 6f 72 - $stringWith_Weird_Name XREF[1]: entry:0021016c(*) + 6c 6f 20 + 57 6f 72 + $stringWith_Weird_Name XREF[1]: entry:0021016c(*) 002201d6 54 68 69 ds "This string has a very strange label\n" - 73 20 73 - 74 72 69 + 73 20 73 + 74 72 69 ... """ @@ -122,11 +122,11 @@ def _parse_ghidra_disassembly(disasm: str) -> dict: # address bytes mnemonic o1,o2 (,o3) pattern = re.compile( r"^( ){8}(?P
[0-9a-f]+) " - "(?P([0-9a-f]{2}[ ]){4})\s+" - "(?P[\w\.]+)\s*" - "(?P[\w#$=>]+)?,?" - "((?P[\w#$=>]+))?,?" - "((?P[\w#$=>]+))?" + + r"(?P([0-9a-f]{2}[ ]){4})\s+" + + r"(?P[\w\.]+)\s*" + + r"(?P[\w#$=>]+)?,?" + + r"((?P[\w#$=>]+))?,?" + + r"((?P[\w#$=>]+))?" ) for line in disasm.splitlines()[20:]: m = pattern.match(line) From bf38f225af9376f83966accb341b1d1682ebf6a9 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Wed, 4 Sep 2024 08:31:13 +0000 Subject: [PATCH 5/7] index instructions by address --- capa/features/extractors/binexport2/__init__.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index dd860dbf6..cfe926d8f 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -104,6 +104,7 @@ def __init__(self, be2: BinExport2): self.string_reference_index_by_source_instruction_index: Dict[int, List[int]] = defaultdict(list) self.insn_address_by_index: Dict[int, int] = {} + self.insn_by_address: Dict[int, BinExport2.Instruction] = {} # must index instructions first self._index_insn_addresses() @@ -186,6 +187,7 @@ def _index_insn_addresses(self): addr = next_addr next_addr += len(insn.raw_bytes) self.insn_address_by_index[idx] = addr + self.insn_by_address[addr] = insn @staticmethod def instruction_indices(basic_block: BinExport2.BasicBlock) -> Iterator[int]: @@ -236,11 +238,8 @@ def get_function_name_by_address(self, address: int) -> str: return self.get_function_name_by_vertex(vertex_index) def get_instruction_by_address(self, address: int) -> BinExport2.Instruction: - for i, be2_insn in enumerate(self.be2.instruction): - insn = self.get_insn_address(i) - if address == insn: - return be2_insn - raise ValueError(f"address 0x{address:x} not found") + assert address in self.insn_by_address, f"address must be indexed, missing {address:x}" + return self.insn_by_address[address] class BinExport2Analysis: From 3c97edc8d2c87051ac1616d15d9ac24071f9f268 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Wed, 4 Sep 2024 12:40:15 +0000 Subject: [PATCH 6/7] adjust and extend tests --- tests/test_binexport_accessors.py | 124 +++++++++++++++++++++--------- 1 file changed, 86 insertions(+), 38 deletions(-) diff --git a/tests/test_binexport_accessors.py b/tests/test_binexport_accessors.py index 9f61f1722..5097f4d90 100644 --- a/tests/test_binexport_accessors.py +++ b/tests/test_binexport_accessors.py @@ -177,47 +177,93 @@ def test_get_instruction_operands_count(): @pytest.mark.parametrize( - "addr,op_expressions", + "addr,expressions", [ # 00210158 20 00 80 d2 mov x0,#0x1 - (0x210158, ("x0", "#0x1")), + ( + 0x210158, + ( + BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x0"), + BinExport2.Expression(type=BinExport2.Expression.IMMEDIATE_INT, immediate=0x1), + ), + ), # 0021015c a1 02 00 58 ldr x1=>helloWorldStr,DAT_002101b0 - (0x21015C, ("x1", "DAT_002101b0")), + ( + 0x21015C, + ( + BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x1"), + BinExport2.Expression( + type=BinExport2.Expression.IMMEDIATE_INT, symbol="DAT_002101b0", immediate=0x2101B0 + ), + ), + ), # 00210184 02 68 61 38 ldrb w2,[x0, x1, LSL ] - (0x210184, ("w2", "[x0, x1, LSL ]")), + # ^^^ issue in Ghidra? + # IDA gives LDRB W2, [X0,X1] + # still need to test/handle this and it's the only complex operand expression in this test binary :/ + ( + 0x210184, + ( + BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="w2"), + ( + BinExport2.Expression(type=BinExport2.Expression.DEREFERENCE, symbol="["), + BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x0"), + BinExport2.Expression(type=BinExport2.Expression.OPERATOR, symbol=","), + BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x1"), + BinExport2.Expression(type=BinExport2.Expression.OPERATOR, symbol=","), + BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="LSL"), + BinExport2.Expression(type=BinExport2.Expression.DEREFERENCE, symbol="]"), + ), + ), + ), # 00210190 21 04 00 91 add x1,x1,#0x1 - (0x210190, ("x1", "x1", "#0x1")), + ( + 0x210190, + ( + BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x1"), + BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x1"), + BinExport2.Expression(type=BinExport2.Expression.IMMEDIATE_INT, immediate=0x1), + ), + ), ], ) -def test_get_operand_expressions(addr, op_expressions): +def test_get_operand_expressions(addr, expressions): insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr) ops = get_instruction_operands(BE2_EXTRACTOR.be2, insn) for i, op in enumerate(ops): + op_expression = expressions[i] exps = get_operand_expressions(BE2_EXTRACTOR.be2, op) - assert len(exps) == 1 - assert exps[0].symbol == op_expressions[i] + if len(exps) > 1: + for j, exp in enumerate(exps): + assert exp.type == op_expression[j].type + assert exp.symbol == op_expression[j].symbol + else: + assert len(exps) == 1 + assert exps[0] == op_expression @pytest.mark.parametrize( - "addr,reg_expressions", + "addr,expressions", [ # 00210158 20 00 80 d2 mov x0,#0x1 (0x210158, ("x0", None)), # 0021015c a1 02 00 58 ldr x1=>helloWorldStr,DAT_002101b0 (0x21015C, ("x1", None)), - # 00210184 02 68 61 38 ldrb w2,[x0, x1, LSL ] - (0x210184, ("w2", None)), + # 0021019c e1 03 00 aa mov x1,x0 + (0x21019C, ("x1", "x0")), # 00210190 21 04 00 91 add x1,x1,#0x1 (0x210190, ("x1", "x1", None)), ], ) -def _TODO_test_get_operand_register_expression(addr, reg_expressions): +def test_get_operand_register_expression(addr, expressions): insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr) ops = get_instruction_operands(BE2_EXTRACTOR.be2, insn) for i, op in enumerate(ops): reg_exp = get_operand_register_expression(BE2_EXTRACTOR.be2, op) - logger.debug("%s", get_operand_expressions(BE2_EXTRACTOR.be2, op)) - assert reg_exp == reg_expressions[i] + if reg_exp is None: + assert reg_exp == expressions[i] + else: + assert reg_exp.symbol == expressions[i] @pytest.mark.parametrize( @@ -226,20 +272,22 @@ def _TODO_test_get_operand_register_expression(addr, reg_expressions): # 00210158 20 00 80 d2 mov x0,#0x1 (0x210158, (None, 0x1)), # 0021015c a1 02 00 58 ldr x1=>helloWorldStr,DAT_002101b0 - (0x21015C, (None, None)), - # 00210184 02 68 61 38 ldrb w2,[x0, x1, LSL ] - (0x210184, (None, None)), + (0x21015C, (None, 0x2101B0)), + # 002101a8 01 00 00 d4 svc 0x0 + (0x2101A8, (0x0,)), # 00210190 21 04 00 91 add x1,x1,#0x1 (0x210190, (None, None, 0x1)), ], ) -def _TODO_test_get_operand_immediate_expression(addr, expressions): +def test_get_operand_immediate_expression(addr, expressions): insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr) ops = get_instruction_operands(BE2_EXTRACTOR.be2, insn) for i, op in enumerate(ops): reg_exp = get_operand_immediate_expression(BE2_EXTRACTOR.be2, op) - logger.debug("%s", get_operand_expressions(BE2_EXTRACTOR.be2, op)) - assert reg_exp == expressions[i] + if reg_exp is None: + assert reg_exp == expressions[i] + else: + assert reg_exp.immediate == expressions[i] """ @@ -249,11 +297,11 @@ def _TODO_test_get_operand_immediate_expression(addr, expressions): """ BE2_DICT: Dict[str, Any] = { "expression": [ - {"type": 1, "symbol": "x0"}, - {"type": 2, "immediate": 0x20}, - {"type": 3, "immediate": 0x100}, - {"type": 1, "symbol": "sp"}, - {"type": 3, "immediate": 0x10}, + {"type": BinExport2.Expression.REGISTER, "symbol": "x0"}, + {"type": BinExport2.Expression.IMMEDIATE_INT, "immediate": 0x20}, + {"type": BinExport2.Expression.IMMEDIATE_INT, "immediate": 0x100}, + {"type": BinExport2.Expression.REGISTER, "symbol": "sp"}, + {"type": BinExport2.Expression.IMMEDIATE_INT, "immediate": 0x10}, ], # operand consists of 1 or more expressions, linked together as a tree "operand": [ @@ -281,20 +329,20 @@ def _TODO_test_get_operand_immediate_expression(addr, expressions): ) -def _TODO_test_is_stack_register_expression(): +def test_is_stack_register_expression(): mov = ParseDict(BE2_DICT["instruction"][0], BinExport2.Instruction()) add = ParseDict(BE2_DICT["instruction"][2], BinExport2.Instruction()) - ops = get_instruction_operands(BE2_EXTRACTOR.be2, mov) - exps = get_operand_expressions(BE2_EXTRACTOR.be2, ops[0]) - assert is_stack_register_expression(BE2_EXTRACTOR.be2, exps[0]) is False - exps = get_operand_expressions(BE2_EXTRACTOR.be2, ops[1]) - assert is_stack_register_expression(BE2_EXTRACTOR.be2, exps[0]) is False + mov_op0, mov_op1 = get_instruction_operands(BE2, mov) + op0_exp0 = get_operand_expressions(BE2, mov_op0)[0] + assert is_stack_register_expression(BE2, op0_exp0) is False + op0_exp1 = get_operand_expressions(BE2, mov_op1)[0] + assert is_stack_register_expression(BE2, op0_exp1) is False - ops = get_instruction_operands(BE2_EXTRACTOR.be2, add) - exps = get_operand_expressions(BE2_EXTRACTOR.be2, ops[0]) - assert is_stack_register_expression(BE2_EXTRACTOR.be2, exps[0]) is False - exps = get_operand_expressions(BE2_EXTRACTOR.be2, ops[1]) - assert is_stack_register_expression(BE2_EXTRACTOR.be2, exps[0]) is True - exps = get_operand_expressions(BE2_EXTRACTOR.be2, ops[1]) - assert is_stack_register_expression(BE2_EXTRACTOR.be2, exps[0]) is False + add_op0, add_op1, add_op2 = get_instruction_operands(BE2, add) + op0_exp0 = get_operand_expressions(BE2, add_op0)[0] + assert is_stack_register_expression(BE2, op0_exp0) is False + op1_exp0 = get_operand_expressions(BE2, add_op1)[0] + assert is_stack_register_expression(BE2, op1_exp0) is True + op2_exp0 = get_operand_expressions(BE2, add_op2)[0] + assert is_stack_register_expression(BE2, op2_exp0) is False From 7142bf70e8a534351ce0c75f12aef5c5ca54a136 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Wed, 4 Sep 2024 12:41:36 +0000 Subject: [PATCH 7/7] handle operator with no children bug --- capa/features/extractors/binexport2/helpers.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/capa/features/extractors/binexport2/helpers.py b/capa/features/extractors/binexport2/helpers.py index 3bad3162c..a6fd5827d 100644 --- a/capa/features/extractors/binexport2/helpers.py +++ b/capa/features/extractors/binexport2/helpers.py @@ -102,7 +102,14 @@ def _get_operand_expression_list( elif expression.type == BinExport2.Expression.OPERATOR: - if len(children_tree_indexes) == 1: + if len(children_tree_indexes) == 0: + # TODO(mr-tz): Ghidra bug? + # https://github.com/mandiant/capa/pull/2340 + # 00210184 02 68 61 38 ldrb w2,[x0, x1, LSL ] + # ^^^ + pass + + elif len(children_tree_indexes) == 1: # prefix operator, like "ds:" expression_list.append(expression) child_index = children_tree_indexes[0]