Skip to content

Commit

Permalink
add some scripts for reversing file formats
Browse files Browse the repository at this point in the history
  • Loading branch information
8051Enthusiast committed Apr 24, 2024
1 parent 4a89a13 commit c016994
Show file tree
Hide file tree
Showing 2 changed files with 169 additions and 0 deletions.
114 changes: 114 additions & 0 deletions scripts/find_blocks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
#!/usr/bin/env python3
# In some file formats, the data is divided into blocks, with each block
# having a checksum appended to it. This means that, given a list of
# regions with valid checksums, in those formats we will find a higher
# count of regions where the end of a region is a fixed distance from the
# start of another region (the block gap).
# This script takes a list of checksum algorithms and a list of files and
# attempts to find the most common block gaps in the files, using the
# geometric mean of the number of blocks with a given gap size to combine
# the scores from multiple files.
import numpy as np
from scipy import signal
import subprocess
import os
import sys
import json
import pathlib
import argparse

# Command used to invoke delsum: the value of the DELSUM_PATH environment
# variable if it is set, otherwise the bare command name 'delsum'
delsum_path = os.environ.get('DELSUM_PATH', 'delsum')

# Runs `delsum part` on a file and returns its parsed JSON output: a
# dictionary keyed by model, where each value is a list of segments and
# every segment carries a list of "start" and a list of "end" offsets.
# `model` is passed with -M when it is a pathlib.Path (a file containing
# models) and with -m otherwise (a model given directly as a string).
# Raises ValueError carrying delsum's stderr if the process exits non-zero.
def delsum(file, model):
    model_flag = '-M' if isinstance(model, pathlib.Path) else '-m'
    command = [
        delsum_path, 'part', '-j', '-s', '-p', '-t0',
        model_flag, model,
        file,
    ]

    proc = subprocess.run(command, capture_output=True, text=True)
    if proc.returncode == 0:
        # stdout is expected to be a single JSON document
        return json.loads(proc.stdout)
    raise ValueError(f'Error running delsum: {proc.stderr}')

# Returns an array of length size - 1 where the entry at index `g` is the
# number of (end, start) pairs in `file_model_data` whose start lies
# exactly g + 1 positions after the end, i.e. the number of blocks that
# are separated by a gap of g bytes
def correlate(size, file_model_data):
    # indicator arrays: 1 at every offset that occurs as a segment start
    # (respectively segment end), 0 everywhere else
    is_start = np.zeros(size, dtype=np.float32)
    is_end = np.zeros(size, dtype=np.float32)
    for seg in file_model_data:
        is_start[np.asarray(seg["start"], dtype=np.intp)] = 1
        is_end[np.asarray(seg["end"], dtype=np.intp)] = 1

    # cross-correlating the two indicators counts, for every lag, how many
    # starts sit that far behind an end; rounding cleans up float noise
    lag_counts = np.round(signal.correlate(is_start, is_end, mode='full'))

    # delsum outputs inclusive ranges, so a start directly following an
    # end (lag 1) is a gap of 0. Lag 0 sits at index size - 1 of the full
    # correlation, hence slicing from `size` makes index g mean gap g.
    return lag_counts[size:]

# Scores every gap size for a single model across several files.
# `sizes` holds the file sizes and `model_data` the matching per-file
# segment lists for that model. The per-file gap counts are combined with
# a geometric mean, and the `top` best gap sizes are returned together
# with their scores as a pair (gap_sizes, scores), best first.
def find_blocks_for_model(sizes, model_data, top):
    file_count = len(sizes)
    # running product of (count + 1) per gap size, sized for the largest file
    product = np.ones(np.max(sizes) - 1, dtype=np.float64)
    for (file_size, file_data) in zip(sizes, model_data):
        counts = correlate(file_size, file_data)
        # the + 1 keeps zero counts from wiping out the whole product;
        # it is undone again after taking the root below
        product[:file_size - 1] *= counts + 1
    geo_mean = np.power(product, 1/file_count) - 1

    # indices double as gap sizes; order them by descending score
    ranking = np.argsort(geo_mean)[::-1]
    best = ranking[:top]
    return (best, geo_mean[best])

# Given a list of files and a model (a model string, or a pathlib.Path to
# a file containing models), score each gap width, combining the scores
# from multiple files using the geometric mean, and return the top `top`
# gap widths for each model.
#
# Returns a pair (models, scores), both sorted by descending best score,
# where each entry of `scores` is the (gap_sizes, gap_scores) pair
# produced by find_blocks_for_model for the model at the same position.
#
# Files that cannot be processed are reported on stderr and skipped.
# Raises ValueError if no file at all could be processed.
def find_blocks(files, model, top):
    sizes = []
    data = []
    for file in files:
        try:
            size = os.path.getsize(file)
            data.append(delsum(file, model))
            # only record the size once delsum succeeded, so that `sizes`
            # and `data` always stay aligned
            sizes.append(size)
        except Exception as e:
            # deliberate best effort: one bad file must not abort the run
            print(f'Error processing {file}: {e}, skipping...', file=sys.stderr)

    if not data:
        # without this check the lookup of data[0] below would fail with
        # an unhelpful IndexError
        raise ValueError('No files could be processed')

    # every delsum result is keyed by model; take the key set of the first
    models = list(data[0])
    scores = [
        find_blocks_for_model(sizes, [d[name] for d in data], top)
        for name in models
    ]

    # rank models by their single best gap score; a model without any
    # scored gap (e.g. top == 0) ranks as 0 instead of raising IndexError
    top_scores = [s[1][0] if len(s[1]) > 0 else 0.0 for s in scores]
    idx = np.argsort(top_scores)[::-1]
    scores_sorted = [scores[i] for i in idx]
    models_sorted = [models[i] for i in idx]
    return (models_sorted, scores_sorted)

def main():
parser = argparse.ArgumentParser(description='Find checksummed blocks in a file')
parser.add_argument('filenames', type=pathlib.Path, help='Files to search for blocks', nargs='+')
parser.add_argument('-m', '--model', type=str, help='Model to use for checksumming')
parser.add_argument('-M', '--model-file', type=pathlib.Path,
help='File containing models to use for checksumming')
parser.add_argument('-t', '--top', type=int, default=3,
help='Number of top block gaps to display')

args = parser.parse_args()

match (args.model, args.model_file):
case (model, None) | (None, model):
(models, scores) = find_blocks(args.filename, model, args.top)
case (None, None):
raise ValueError('Must specify either a model or a model file')
case (_, _):
raise ValueError('Must specify only one of model or model file')

for (model, score) in zip(models, scores):
print(f'Model: {model}')
for (idx, s) in zip(*score):
print(f'Block gap: {idx}, Score: {s:.3f}')

if __name__ == '__main__':
main()
55 changes: 55 additions & 0 deletions scripts/show_blocks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/usr/bin/env python3
import argparse
from find_blocks import delsum
import pathlib

# Prints, for one file and one model, the checksummed segments reported by
# delsum that take part in a block gap of the requested size. One line is
# printed per segment in the form `starts:ends`, with the offsets written
# as zero-padded hexadecimal numbers separated by commas.
parser = argparse.ArgumentParser(description='Show checksummed blocks in a file')
parser.add_argument('filename', type=pathlib.Path, help='File to show blocks from')
parser.add_argument('-m', '--model', type=str, help='Model to use for checksumming', required=True)
parser.add_argument('-g', '--gap', type=int, help='Block gap to display', required=True)

args = parser.parse_args()

model = args.model
filename = args.filename
# delsum returns a dictionary keyed by model; only one model was requested
data = delsum(filename, model)[model]
gap = args.gap

# keep track of all ends so we can quickly find the corresponding start;
# maps an end offset to the index of the segment it belongs to
ends = {}
for (i, seg) in enumerate(data):
    for end in seg["end"]:
        ends[end] = i


# write down all starts and ends that are adjacent to a gap
block_starts = [set() for _ in data]
block_ends = [set() for _ in data]

for (i, seg) in enumerate(data):
    for start in seg["start"]:
        # ranges are inclusive, so a gap of `gap` bytes puts the previous
        # end gap + 1 positions before this start
        end_addr = start - (gap + 1)
        if end_addr in ends:
            block_starts[i].add(start)
            block_ends[ends[end_addr]].add(end_addr)

# width of the largest end offset in hex digits (hex() adds a '0x' prefix);
# default=0 avoids a ValueError when delsum reported no segments at all
num_digits = len(hex(max(ends, default=0))) - 2

# the loop variables deliberately do not reuse the name `ends`, so the
# lookup table above is not overwritten
for (orig, seg_starts, seg_ends) in zip(data, block_starts, block_ends):
    if not seg_starts and not seg_ends:
        continue
    all_starts = seg_starts
    all_ends = seg_ends
    orig_starts = orig["start"]
    orig_ends = orig["end"]
    # if one side was part of a gap, include all matching ends
    # from the other side, but only if only one side is part of a gap
    if seg_starts and not seg_ends:
        minimum = min(seg_starts)
        all_ends = all_ends | {end for end in orig_ends if end > minimum}
    elif seg_ends and not seg_starts:
        maximum = max(seg_ends)
        all_starts = all_starts | {start for start in orig_starts if start < maximum}
    start_list = ','.join(f'{s:0{num_digits}x}' for s in sorted(all_starts))
    end_list = ','.join(f'{e:0{num_digits}x}' for e in sorted(all_ends))
    print(f'{start_list}:{end_list}')

0 comments on commit c016994

Please sign in to comment.