From c0169945990a13325d6b491a5c4b296e43da3845 Mon Sep 17 00:00:00 2001
From: 8051Enthusiast <8051enthusiast@protonmail.com>
Date: Wed, 24 Apr 2024 16:29:38 +0200
Subject: [PATCH] add some scripts for reversing file formats

---
 scripts/find_blocks.py | 114 +++++++++++++++++++++++++++++++++++++++++
 scripts/show_blocks.py |  55 ++++++++++++++++++++
 2 files changed, 169 insertions(+)
 create mode 100755 scripts/find_blocks.py
 create mode 100755 scripts/show_blocks.py

diff --git a/scripts/find_blocks.py b/scripts/find_blocks.py
new file mode 100755
index 0000000..eb96f7e
--- /dev/null
+++ b/scripts/find_blocks.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+# In some file formats, the data is divided into blocks, with each block
+# having a checksum appended to it. This means that, given a list of
+# regions with valid checksums, such formats will show an elevated number
+# of cases where the end of one region lies a fixed distance (the block
+# gap) from the start of another region.
+# This script takes a list of checksum algorithms and a list of files and
+# attempts to find the most common block gaps in the files, using the
+# geometric mean of the number of blocks with a given gap size to combine
+# the scores from multiple files.
+import numpy as np
+from scipy import signal
+import subprocess
+import os
+import sys
+import json
+import pathlib
+import argparse
+
+try:
+    delsum_path = os.environ['DELSUM_PATH']
+except KeyError:
+    delsum_path = 'delsum'
+
+# Takes a file and a model and returns the delsum output as a dictionary
+# mapping each model to a list of start/end range lists
+def delsum(file, model):
+    args = [delsum_path, 'part', '-j', '-s', '-p', '-t0']
+    if isinstance(model, pathlib.Path):
+        args.extend(['-M', model])
+    else:
+        args.extend(['-m', model])
+    args.append(file)
+
+    output = subprocess.run(args, capture_output=True, text=True)
+    if output.returncode != 0:
+        raise ValueError(f'Error running delsum: {output.stderr}')
+    return json.loads(output.stdout)
+
+# Returns an array in which the entry at each index is the number of
+# blocks whose gap size equals that index
+def correlate(size, file_model_data):
+    starts = np.zeros(size, dtype=np.float32)
+    ends = np.zeros(size, dtype=np.float32)
+    for segs in file_model_data:
+        for start in segs["start"]:
+            starts[start] = 1
+        for end in segs["end"]:
+            ends[end] = 1
+    res = np.round(signal.correlate(starts, ends, mode='full'))
+    # the ranges reported by delsum are inclusive, so we effectively
+    # subtract 1 from the gap sizes here to make them exclusive
+    # because the middle is at size - 1
+    return res[size:]
+
+def find_blocks_for_model(sizes, model_data, top):
+    # calculate the geometric mean of the scores
+    scores = np.ones(np.max(sizes) - 1, dtype=np.float64)
+    for (size, data) in zip(sizes, model_data):
+        # make sure that zeros are not included in the geometric mean
+        scores[:size - 1] *= correlate(size, data) + 1
+    scores = np.power(scores, 1/len(sizes)) - 1
+    top_idx = np.argsort(scores)[::-1][:top]
+    return (top_idx, scores[top_idx])
+
+# Given a list of files and a model (or model file), score each gap width,
+# combining the scores from multiple files using the geometric mean, and
+# return the top `top` gap widths for each model
+def find_blocks(files, model, top):
+    sizes = []
+    data = []
+    for file in files:
+        try:
+            size = os.path.getsize(file)
+            data.append(delsum(file, model))
+            sizes.append(size)
+        except Exception as e:
+            print(f'Error processing {file}: {e}, skipping...', file=sys.stderr)
+    models = list(data[0].keys())
+    scores = []
+    for model in models:
+        scores.append(find_blocks_for_model(sizes, [d[model] for d in data], top))
+    top_scores = [s[1][0] for s in scores]
+    idx = np.argsort(top_scores)[::-1]
+    scores_sorted = [scores[i] for i in idx]
+    models_sorted = [models[i] for i in idx]
+    return (models_sorted, scores_sorted)
+
+def main():
+    parser = argparse.ArgumentParser(description='Find checksummed blocks in a file')
+    parser.add_argument('filenames', type=pathlib.Path, help='Files to search for blocks', nargs='+')
+    parser.add_argument('-m', '--model', type=str, help='Model to use for checksumming')
+    parser.add_argument('-M', '--model-file', type=pathlib.Path,
+                        help='File containing models to use for checksumming')
+    parser.add_argument('-t', '--top', type=int, default=3,
+                        help='Number of top block gaps to display')
+
+    args = parser.parse_args()
+
+    match (args.model, args.model_file):
+        case (None, None):
+            raise ValueError('Must specify either a model or a model file')
+        case (model, None) | (None, model):
+            (models, scores) = find_blocks(args.filenames, model, args.top)
+        case (_, _):
+            raise ValueError('Must specify only one of model or model file')
+
+    for (model, score) in zip(models, scores):
+        print(f'Model: {model}')
+        for (idx, s) in zip(*score):
+            print(f'Block gap: {idx}, Score: {s:.3f}')
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/scripts/show_blocks.py b/scripts/show_blocks.py
new file mode 100755
index 0000000..6a63460
--- /dev/null
+++ b/scripts/show_blocks.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+import argparse
+from find_blocks import delsum
+import pathlib
+
+parser = argparse.ArgumentParser(description='Show checksummed blocks in a file')
+parser.add_argument('filename', type=pathlib.Path, help='File to show blocks from')
+parser.add_argument('-m', '--model', type=str, help='Model to use for checksumming', required=True)
+parser.add_argument('-g', '--gap', type=int, help='Block gap to display', required=True)
+
+args = parser.parse_args()
+
+model = args.model
+filename = args.filename
+data = delsum(filename, model)[model]
+gap = args.gap
+
+# keep track of all ends so we can quickly find the segment an end belongs to
+ends = {}
+for (i, seg) in enumerate(data):
+    for end in seg["end"]:
+        ends[end] = i
+
+
+# write down all starts and ends that are adjacent to a gap
+block_starts = [set() for _ in range(len(data))]
+block_ends = [set() for _ in range(len(data))]
+
+for (i, seg) in enumerate(data):
+    for start in seg["start"]:
+        end_addr = start - (gap + 1)
+        if end_addr in ends:
+            block_starts[i].add(start)
+            block_ends[ends[end_addr]].add(end_addr)
+
+num_digits = len(hex(max(ends.keys()))) - 2
+
+for (orig, starts, ends) in zip(data, block_starts, block_ends):
+    if len(starts) == len(ends) == 0:
+        continue
+    all_starts = starts
+    all_ends = ends
+    orig_starts = orig["start"]
+    orig_ends = orig["end"]
+    # if only one side of this segment is adjacent to a gap, also include
+    # the candidates from the other side that can still form a valid range
+    if len(starts) > 0 and len(ends) == 0:
+        minimum = min(starts)
+        all_ends = all_ends | {end for end in orig_ends if end > minimum}
+    elif len(ends) > 0 and len(starts) == 0:
+        maximum = max(ends)
+        all_starts = all_starts | {start for start in orig_starts if start < maximum}
+    start_list = ','.join(f'{s:0{num_digits}x}' for s in sorted(all_starts))
+    end_list = ','.join(f'{e:0{num_digits}x}' for e in sorted(all_ends))
+    print(f'{start_list}:{end_list}')
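
A rough sketch of how the two new scripts could be used together, not part of
the patch itself: the file names and the delsum model string below are invented
for illustration, the exact model syntax should be checked against delsum's own
documentation, and the delsum binary is assumed to be reachable via PATH or the
DELSUM_PATH environment variable. find_blocks.py is imported here as a module.

    #!/usr/bin/env python3
    # hypothetical driver: score gap widths over some example dumps, then
    # print the best candidates so one of them can be passed to show_blocks.py
    import pathlib
    from find_blocks import find_blocks

    # example inputs: two dumps suspected to share a block-structured format
    files = [pathlib.Path('dump1.bin'), pathlib.Path('dump2.bin')]
    # example checksum model (an assumed CRC-32 spelling, see the delsum docs)
    model = 'crc width=32 poly=0x04c11db7 init=0xffffffff xorout=0xffffffff refin=true refout=true'

    # keep the 3 highest-scoring gap widths for each model delsum reports
    models, scores = find_blocks(files, model, 3)
    for m, (gaps, vals) in zip(models, scores):
        print(f'Model: {m}')
        for gap, val in zip(gaps, vals):
            print(f'  gap {gap}: score {val:.3f}')

    # a promising gap can then be inspected with, for example:
    #   ./show_blocks.py -m '<model>' -g <gap> dump1.bin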