From c0169945990a13325d6b491a5c4b296e43da3845 Mon Sep 17 00:00:00 2001
From: 8051Enthusiast <8051enthusiast@protonmail.com>
Date: Wed, 24 Apr 2024 16:29:38 +0200
Subject: [PATCH] add some scripts for reversing file formats

---
 scripts/find_blocks.py | 114 +++++++++++++++++++++++++++++++++++++++++
 scripts/show_blocks.py |  55 ++++++++++++++++++++
 2 files changed, 169 insertions(+)
 create mode 100755 scripts/find_blocks.py
 create mode 100755 scripts/show_blocks.py

diff --git a/scripts/find_blocks.py b/scripts/find_blocks.py
new file mode 100755
index 0000000..eb96f7e
--- /dev/null
+++ b/scripts/find_blocks.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+# In some file formats, the data is divided into blocks, with each block
+# having a checksum appended to it. This means that, given a list of
+# regions with valid checksums, such formats will show an elevated number
+# of cases where the end of one region lies a fixed distance (the block
+# gap) from the start of another region.
+# This script takes a list of checksum algorithms and a list of files and
+# attempts to find the most common block gaps in the files, using the
+# geometric mean of the number of blocks with a given gap size to combine
+# the scores from multiple files.
+import numpy as np
+from scipy import signal
+import subprocess
+import os
+import sys
+import json
+import pathlib
+import argparse
+
+try:
+    delsum_path = os.environ['DELSUM_PATH']
+except KeyError:
+    delsum_path = 'delsum'
+
+# Takes a file and a model and returns the delsum output as a dictionary
+# mapping each model to a list of start/end range lists
+def delsum(file, model):
+    args = [delsum_path, 'part', '-j', '-s', '-p', '-t0']
+    if isinstance(model, pathlib.Path):
+        args.extend(['-M', model])
+    else:
+        args.extend(['-m', model])
+    args.append(file)
+
+    output = subprocess.run(args, capture_output=True, text=True)
+    if output.returncode != 0:
+        raise ValueError(f'Error running delsum: {output.stderr}')
+    return json.loads(output.stdout)
+
+# Returns an array in which the entry at each index is the number of
+# blocks whose gap size equals that index
+def correlate(size, file_model_data):
+    starts = np.zeros(size, dtype=np.float32)
+    ends = np.zeros(size, dtype=np.float32)
+    for segs in file_model_data:
+        for start in segs["start"]:
+            starts[start] = 1
+        for end in segs["end"]:
+            ends[end] = 1
+    res = np.round(signal.correlate(starts, ends, mode='full'))
+    # the ranges reported by delsum are inclusive, so we effectively
+    # subtract 1 from the gap sizes here to make them exclusive
+    # because the middle is at size - 1
+    return res[size:]
+
+def find_blocks_for_model(sizes, model_data, top):
+    # calculate the geometric mean of the scores
+    scores = np.ones(np.max(sizes) - 1, dtype=np.float64)
+    for (size, data) in zip(sizes, model_data):
+        # make sure that zeros are not included in the geometric mean
+        scores[:size - 1] *= correlate(size, data) + 1
+    scores = np.power(scores, 1/len(sizes)) - 1
+    top_idx = np.argsort(scores)[::-1][:top]
+    return (top_idx, scores[top_idx])
+
+# Given a list of files and a model (or model file), score each gap width,
+# combining the scores from multiple files using the geometric mean, and
+# return the top `top` gap widths for each model
+def find_blocks(files, model, top):
+    sizes = []
+    data = []
+    for file in files:
+        try:
+            size = os.path.getsize(file)
+            data.append(delsum(file, model))
+            sizes.append(size)
+        except Exception as e:
+            print(f'Error processing {file}: {e}, skipping...', file=sys.stderr)
+    models = list(data[0].keys())
+    scores = []
+    for model in models:
+        scores.append(find_blocks_for_model(sizes, [d[model] for d in data], top))
+    top_scores = [s[1][0] for s in scores]
+    idx = np.argsort(top_scores)[::-1]
+    scores_sorted = [scores[i] for i in idx]
+    models_sorted = [models[i] for i in idx]
+    return (models_sorted, scores_sorted)
+
+def main():
+    parser = argparse.ArgumentParser(description='Find checksummed blocks in a file')
+    parser.add_argument('filenames', type=pathlib.Path, help='Files to search for blocks', nargs='+')
+    parser.add_argument('-m', '--model', type=str, help='Model to use for checksumming')
+    parser.add_argument('-M', '--model-file', type=pathlib.Path,
+                        help='File containing models to use for checksumming')
+    parser.add_argument('-t', '--top', type=int, default=3,
+                        help='Number of top block gaps to display')
+
+    args = parser.parse_args()
+
+    match (args.model, args.model_file):
+        case (None, None):
+            raise ValueError('Must specify either a model or a model file')
+        case (model, None) | (None, model):
+            (models, scores) = find_blocks(args.filenames, model, args.top)
+        case (_, _):
+            raise ValueError('Must specify only one of model or model file')
+
+    for (model, score) in zip(models, scores):
+        print(f'Model: {model}')
+        for (idx, s) in zip(*score):
+            print(f'Block gap: {idx}, Score: {s:.3f}')
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/scripts/show_blocks.py b/scripts/show_blocks.py
new file mode 100755
index 0000000..6a63460
--- /dev/null
+++ b/scripts/show_blocks.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+import argparse
+from find_blocks import delsum
+import pathlib
+
+parser = argparse.ArgumentParser(description='Show checksummed blocks in a file')
+parser.add_argument('filename', type=pathlib.Path, help='File to show blocks from')
+parser.add_argument('-m', '--model', type=str, help='Model to use for checksumming', required=True)
+parser.add_argument('-g', '--gap', type=int, help='Block gap to display', required=True)
+
+args = parser.parse_args()
+
+model = args.model
+filename = args.filename
+data = delsum(filename, model)[model]
+gap = args.gap
+
+# keep track of all ends so we can quickly find the segment an end belongs to
+ends = {}
+for (i, seg) in enumerate(data):
+    for end in seg["end"]:
+        ends[end] = i
+
+
+# write down all starts and ends that are adjacent to a gap
+block_starts = [set() for _ in range(len(data))]
+block_ends = [set() for _ in range(len(data))]
+
+for (i, seg) in enumerate(data):
+    for start in seg["start"]:
+        end_addr = start - (gap + 1)
+        if end_addr in ends:
+            block_starts[i].add(start)
+            block_ends[ends[end_addr]].add(end_addr)
+
+num_digits = len(hex(max(ends.keys()))) - 2
+
+for (orig, starts, ends) in zip(data, block_starts, block_ends):
+    if len(starts) == len(ends) == 0:
+        continue
+    all_starts = starts
+    all_ends = ends
+    orig_starts = orig["start"]
+    orig_ends = orig["end"]
+    # if only one side of this segment is adjacent to a gap, also include
+    # the candidates from the other side that can still form a valid range
+    if len(starts) > 0 and len(ends) == 0:
+        minimum = min(starts)
+        all_ends = all_ends | {end for end in orig_ends if end > minimum}
+    elif len(ends) > 0 and len(starts) == 0:
+        maximum = max(ends)
+        all_starts = all_starts | {start for start in orig_starts if start < maximum}
+    start_list = ','.join(f'{s:0{num_digits}x}' for s in sorted(all_starts))
+    end_list = ','.join(f'{e:0{num_digits}x}' for e in sorted(all_ends))
+    print(f'{start_list}:{end_list}')
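
A rough sketch of how the two new scripts could be used together, not part of
the patch itself: the file names and the delsum model string below are invented
for illustration, the exact model syntax should be checked against delsum's own
documentation, and the delsum binary is assumed to be reachable via PATH or the
DELSUM_PATH environment variable. find_blocks.py is imported here as a module.

    #!/usr/bin/env python3
    # hypothetical driver: score gap widths over some example dumps, then
    # print the best candidates so one of them can be passed to show_blocks.py
    import pathlib
    from find_blocks import find_blocks

    # example inputs: two dumps suspected to share a block-structured format
    files = [pathlib.Path('dump1.bin'), pathlib.Path('dump2.bin')]
    # example checksum model (an assumed CRC-32 spelling, see the delsum docs)
    model = 'crc width=32 poly=0x04c11db7 init=0xffffffff xorout=0xffffffff refin=true refout=true'

    # keep the 3 highest-scoring gap widths for each model delsum reports
    models, scores = find_blocks(files, model, 3)
    for m, (gaps, vals) in zip(models, scores):
        print(f'Model: {m}')
        for gap, val in zip(gaps, vals):
            print(f'  gap {gap}: score {val:.3f}')

    # a promising gap can then be inspected with, for example:
    #   ./show_blocks.py -m '<model>' -g <gap> dump1.bin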