From 89cd491fc76d6ba9ec5a924b646033c4a2945bf2 Mon Sep 17 00:00:00 2001 From: Quentin Kaiser Date: Fri, 7 Apr 2023 17:37:41 +0200 Subject: [PATCH] feat(processing): pattern auto-identification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Integrate pattern recognition for unknown chunks in order to help identify parts. Here we simply detect padding, but this could be extended in the future to detect recurring patterns, encrypted content, or even fingerprints. Co-authored-by: Krisztián Fekete <1246751+e3krisztian@users.noreply.github.com> --- .../{0-1.unknown => 0-1.padding} | 0 .../{0-17.unknown => 0-17.padding} | 0 ...9-366113.unknown => 366109-366113.padding} | 0 ...9-366113.unknown => 366109-366113.padding} | 0 ...8160-32768.unknown => 28160-32768.padding} | 0 ...8160-32768.unknown => 28160-32768.padding} | 0 ...2097152.unknown => 551424-2097152.padding} | 0 tests/test_report.py | 15 ++++++---- unblob/extractor.py | 13 +++++++-- unblob/models.py | 22 ++++++++++++++ unblob/processing.py | 29 +++++++++++++++++-- 11 files changed, 69 insertions(+), 10 deletions(-) rename tests/integration/archive/cpio/cpio_portable_ascii/__output__/sample.nofilepad.unaligned.cpio-newc_extract/{0-1.unknown => 0-1.padding} (100%) rename tests/integration/compression/bzip2/__output__/collated.headpad.bzip2_extract/{0-17.unknown => 0-17.padding} (100%) rename tests/integration/executable/elf/elf64/__output__/kernel-initramfs-padding_extract/17-11651465.elf64_extract/initramfs_extract/{366109-366113.unknown => 366109-366113.padding} (100%) rename tests/integration/executable/elf/elf64/__output__/linux-kernel-with-initramfs-without-loadable-modules_extract/15039-1184915.xz_extract/xz.uncompressed_extract/initramfs_extract/{366109-366113.unknown => 366109-366113.padding} (100%) rename tests/integration/filesystem/fat/fat12/__output__/cherry.truncated.fat12_extract/{28160-32768.unknown => 28160-32768.padding} (100%) rename 
tests/integration/filesystem/fat/fat16/__output__/banana.truncated.fat16_extract/{28160-32768.unknown => 28160-32768.padding} (100%) rename tests/integration/filesystem/fat/fat32/__output__/banana.truncated.fat32_extract/{551424-2097152.unknown => 551424-2097152.padding} (100%) diff --git a/tests/integration/archive/cpio/cpio_portable_ascii/__output__/sample.nofilepad.unaligned.cpio-newc_extract/0-1.unknown b/tests/integration/archive/cpio/cpio_portable_ascii/__output__/sample.nofilepad.unaligned.cpio-newc_extract/0-1.padding similarity index 100% rename from tests/integration/archive/cpio/cpio_portable_ascii/__output__/sample.nofilepad.unaligned.cpio-newc_extract/0-1.unknown rename to tests/integration/archive/cpio/cpio_portable_ascii/__output__/sample.nofilepad.unaligned.cpio-newc_extract/0-1.padding diff --git a/tests/integration/compression/bzip2/__output__/collated.headpad.bzip2_extract/0-17.unknown b/tests/integration/compression/bzip2/__output__/collated.headpad.bzip2_extract/0-17.padding similarity index 100% rename from tests/integration/compression/bzip2/__output__/collated.headpad.bzip2_extract/0-17.unknown rename to tests/integration/compression/bzip2/__output__/collated.headpad.bzip2_extract/0-17.padding diff --git a/tests/integration/executable/elf/elf64/__output__/kernel-initramfs-padding_extract/17-11651465.elf64_extract/initramfs_extract/366109-366113.unknown b/tests/integration/executable/elf/elf64/__output__/kernel-initramfs-padding_extract/17-11651465.elf64_extract/initramfs_extract/366109-366113.padding similarity index 100% rename from tests/integration/executable/elf/elf64/__output__/kernel-initramfs-padding_extract/17-11651465.elf64_extract/initramfs_extract/366109-366113.unknown rename to tests/integration/executable/elf/elf64/__output__/kernel-initramfs-padding_extract/17-11651465.elf64_extract/initramfs_extract/366109-366113.padding diff --git 
a/tests/integration/executable/elf/elf64/__output__/linux-kernel-with-initramfs-without-loadable-modules_extract/15039-1184915.xz_extract/xz.uncompressed_extract/initramfs_extract/366109-366113.unknown b/tests/integration/executable/elf/elf64/__output__/linux-kernel-with-initramfs-without-loadable-modules_extract/15039-1184915.xz_extract/xz.uncompressed_extract/initramfs_extract/366109-366113.padding similarity index 100% rename from tests/integration/executable/elf/elf64/__output__/linux-kernel-with-initramfs-without-loadable-modules_extract/15039-1184915.xz_extract/xz.uncompressed_extract/initramfs_extract/366109-366113.unknown rename to tests/integration/executable/elf/elf64/__output__/linux-kernel-with-initramfs-without-loadable-modules_extract/15039-1184915.xz_extract/xz.uncompressed_extract/initramfs_extract/366109-366113.padding diff --git a/tests/integration/filesystem/fat/fat12/__output__/cherry.truncated.fat12_extract/28160-32768.unknown b/tests/integration/filesystem/fat/fat12/__output__/cherry.truncated.fat12_extract/28160-32768.padding similarity index 100% rename from tests/integration/filesystem/fat/fat12/__output__/cherry.truncated.fat12_extract/28160-32768.unknown rename to tests/integration/filesystem/fat/fat12/__output__/cherry.truncated.fat12_extract/28160-32768.padding diff --git a/tests/integration/filesystem/fat/fat16/__output__/banana.truncated.fat16_extract/28160-32768.unknown b/tests/integration/filesystem/fat/fat16/__output__/banana.truncated.fat16_extract/28160-32768.padding similarity index 100% rename from tests/integration/filesystem/fat/fat16/__output__/banana.truncated.fat16_extract/28160-32768.unknown rename to tests/integration/filesystem/fat/fat16/__output__/banana.truncated.fat16_extract/28160-32768.padding diff --git a/tests/integration/filesystem/fat/fat32/__output__/banana.truncated.fat32_extract/551424-2097152.unknown b/tests/integration/filesystem/fat/fat32/__output__/banana.truncated.fat32_extract/551424-2097152.padding 
similarity index 100% rename from tests/integration/filesystem/fat/fat32/__output__/banana.truncated.fat32_extract/551424-2097152.unknown rename to tests/integration/filesystem/fat/fat32/__output__/banana.truncated.fat32_extract/551424-2097152.padding diff --git a/tests/test_report.py b/tests/test_report.py index 3e96d73124..0a2ee0d39d 100644 --- a/tests/test_report.py +++ b/tests/test_report.py @@ -73,6 +73,7 @@ def hello_kitty_task_results( extract_root: Path, hello_id: str, kitty_id: str, + padding_id: str, container_id="", start_depth=0, ): @@ -133,12 +134,14 @@ def hello_kitty_task_results( size=7, entropy=None, ), - UnknownChunkReport( - id=ANY, + ChunkReport( + id=padding_id, start_offset=263, end_offset=264, size=1, - entropy=None, + handler_name="padding", + is_encrypted=False, + extraction_reports=[], ), ChunkReport( id=hello_id, @@ -286,13 +289,14 @@ def test_flat_report_structure(hello_kitty: Path, extract_root): task_results = get_normalized_task_results(process_result) # extract the ids from the chunks - hello_id, kitty_id = get_chunk_ids(task_results[0]) + padding_id, hello_id, kitty_id = get_chunk_ids(task_results[0]) assert task_results == hello_kitty_task_results( hello_kitty=hello_kitty, extract_root=extract_root, hello_id=hello_id, kitty_id=kitty_id, + padding_id=padding_id, ) @@ -416,7 +420,7 @@ def test_chunk_in_chunk_report_structure(hello_kitty_container: Path, extract_ro # and they should be the only differences [main_id] = get_chunk_ids(task_results[0]) - hello_id, kitty_id = get_chunk_ids(task_results[2]) + padding_id, hello_id, kitty_id = get_chunk_ids(task_results[2]) # We test, that the container is referenced from the internal file # through the chunk id `main_id` @@ -428,6 +432,7 @@ def test_chunk_in_chunk_report_structure(hello_kitty_container: Path, extract_ro extract_root=extract_root / "container_extract", hello_id=hello_id, kitty_id=kitty_id, + padding_id=padding_id, container_id=main_id, start_depth=1, ) diff --git 
a/unblob/extractor.py b/unblob/extractor.py index f78a9ee8b7..8b4aac0d29 100644 --- a/unblob/extractor.py +++ b/unblob/extractor.py @@ -2,11 +2,12 @@ import errno import os from pathlib import Path +from typing import Union from structlog import get_logger from .file_utils import carve, is_safe_path -from .models import Chunk, File, TaskResult, UnknownChunk, ValidChunk +from .models import Chunk, File, PaddingChunk, TaskResult, UnknownChunk, ValidChunk from .report import MaliciousSymlinkRemoved logger = get_logger() @@ -113,8 +114,14 @@ def _fix_extracted_directory(directory: Path): _fix_extracted_directory(outdir) -def carve_unknown_chunk(extract_dir: Path, file: File, chunk: UnknownChunk) -> Path: - filename = f"{chunk.start_offset}-{chunk.end_offset}.unknown" +def carve_unknown_chunk( + extract_dir: Path, file: File, chunk: Union[UnknownChunk, PaddingChunk] +) -> Path: + extension = "unknown" + if isinstance(chunk, PaddingChunk): + extension = "padding" + + filename = f"{chunk.start_offset}-{chunk.end_offset}.{extension}" carve_path = extract_dir / filename logger.info("Extracting unknown chunk", path=carve_path, chunk=chunk) carve_chunk_to_file(carve_path, file, chunk) diff --git a/unblob/models.py b/unblob/models.py index ecf218c1d9..70217c8d75 100644 --- a/unblob/models.py +++ b/unblob/models.py @@ -147,6 +147,28 @@ def as_report(self, entropy: Optional[EntropyReport]) -> UnknownChunkReport: ) +@attr.define(repr=False) +class PaddingChunk(Chunk): + r"""Gaps between valid chunks or otherwise unknown chunks. + + Important for manual analysis, and analytical certainty: for example + entropy, other chunks inside it, metadata, etc. 
+ """ + + def as_report( + self, entropy: Optional[EntropyReport] # noqa: ARG002 + ) -> ChunkReport: + return ChunkReport( + id=self.id, + start_offset=self.start_offset, + end_offset=self.end_offset, + size=self.size, + is_encrypted=False, + handler_name="padding", + extraction_reports=[], + ) + + @attrs.define class MultiFile(Blob): name: str = attr.field(kw_only=True) diff --git a/unblob/processing.py b/unblob/processing.py index 5aefa6905a..fd38c8fd2c 100644 --- a/unblob/processing.py +++ b/unblob/processing.py @@ -2,7 +2,7 @@ import shutil from operator import attrgetter from pathlib import Path -from typing import Iterable, List, Optional, Sequence, Set, Tuple, Type +from typing import Iterable, List, Optional, Sequence, Set, Tuple, Type, Union import attr import magic @@ -24,6 +24,7 @@ ExtractError, File, MultiFile, + PaddingChunk, ProcessResult, Task, TaskResult, @@ -458,6 +459,29 @@ def _iterate_directory(self, extract_dirs, processed_paths): ) +def is_padding(file: File, chunk: UnknownChunk): + return len(set(file[chunk.start_offset : chunk.end_offset])) == 1 + + +def process_patterns( + unknown_chunks: List[UnknownChunk], file: File +) -> List[Union[UnknownChunk, PaddingChunk]]: + processed_chunks = [] + for unknown_chunk in unknown_chunks: + if is_padding(file, unknown_chunk): + processed_chunks.append( + PaddingChunk( + start_offset=unknown_chunk.start_offset, + end_offset=unknown_chunk.end_offset, + id=unknown_chunk.id, + file=unknown_chunk.file, + ) + ) + else: + processed_chunks.append(unknown_chunk) + return processed_chunks + + class _FileTask: def __init__( self, @@ -495,6 +519,7 @@ def process(self): ) outer_chunks = remove_inner_chunks(all_chunks) unknown_chunks = calculate_unknown_chunks(outer_chunks, self.size) + unknown_chunks = process_patterns(unknown_chunks, file) assign_file_to_chunks(outer_chunks, file=file) assign_file_to_chunks(unknown_chunks, file=file) @@ -511,7 +536,7 @@ def _process_chunks( self, file: File, outer_chunks: 
List[ValidChunk], - unknown_chunks: List[UnknownChunk], + unknown_chunks: List[Union[UnknownChunk, PaddingChunk]], ): if unknown_chunks: logger.warning("Found unknown Chunks", chunks=unknown_chunks)