From 34ddfb2930d8814262a9b4023d388fae34a9384d Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Mon, 3 Jun 2024 16:45:14 -0700 Subject: [PATCH] Add support for hashing files with header. Missed this in #188, but found out I need it when working on #190. The `serialize_v0`/`serialize_v1` methods all had headers in front of the files, so we need to do that too. Will update usage of header on #190 shortly. As a benefit, we can simulate hashing a file with a header for the first portion of the file and a sharded hasher for the remainder of the file. Signed-off-by: Mihai Maruseac --- model_signing/hashing/file.py | 33 ++++++++++++++++++----- model_signing/hashing/hashing.py | 7 +++-- model_signing/hashing/memory.py | 3 ++- model_signing/hashing/precomputed.py | 3 ++- model_signing/hashing/precomputed_test.py | 8 ++++++ 5 files changed, 43 insertions(+), 11 deletions(-) diff --git a/model_signing/hashing/file.py b/model_signing/hashing/file.py index a6e94ec0..092142eb 100644 --- a/model_signing/hashing/file.py +++ b/model_signing/hashing/file.py @@ -28,11 +28,31 @@ ```python >>> with open("/tmp/file", "w") as f: ... f.write("0123abcd") ->>> hasher = ShardedFileHasher("/tmo/file", SHA256(), start=4, end=8) +>>> hasher = ShardedFileHasher("/tmp/file", SHA256(), start=4, end=8) >>> digest = hasher.compute() >>> digest.digest_hex '88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589' ``` + +Similarly, we can emulate a missing header: +```python +>>> with open("/tmp/file", "w") as f: +... f.write("abcd") +>>> hasher = FileHasher("/tmp/file", SHA256()) +>>> digest = hasher.compute(header=b"0123") +>>> digest.digest_hex +'64eab0705394501ced0ff991bf69077fd3846c1d964e3db28d9600891715d848' +``` + +This is the same as hashing a file with the entire contents: +```python +>>> with open("/tmp/file", "w") as f: +... 
f.write("0123abcd") +>>> hasher = FileHasher("/tmp/file", SHA256()) +>>> digest = hasher.compute() +>>> digest.digest_hex +'64eab0705394501ced0ff991bf69077fd3846c1d964e3db28d9600891715d848' +``` """ import pathlib @@ -101,8 +121,8 @@ def digest_name(self) -> str: return f"file-{self._content_hasher.digest_name}" @override - def compute(self) -> hashing.Digest: - self._content_hasher.reset() + def compute(self, *, header: bytes = b"") -> hashing.Digest: + self._content_hasher.reset(header) if self._chunk_size == 0: with open(self._file, "rb") as f: @@ -144,8 +164,7 @@ def __init__( Args: file: The file to hash. Use `set_file` to reset it. content_hasher: A `hashing.HashEngine` instance used to compute the - digest of the file. This instance must not be used outside of this - instance. However, it may be pre-initialized with a header. + digest of the file. start: The file offset to start reading from. Must be valid. Reset with `set_shard`. end: The file offset to start reading from. Must be stricly greater @@ -195,8 +214,8 @@ def set_shard(self, *, start: int, end: int) -> None: self._end = end @override - def compute(self) -> hashing.Digest: - self._content_hasher.reset() + def compute(self, *, header: bytes = b"") -> hashing.Digest: + self._content_hasher.reset(header) with open(self._file, "rb") as f: f.seek(self._start) diff --git a/model_signing/hashing/hashing.py b/model_signing/hashing/hashing.py index f606d2f6..0a50eafa 100644 --- a/model_signing/hashing/hashing.py +++ b/model_signing/hashing/hashing.py @@ -43,8 +43,11 @@ class HashEngine(metaclass=ABCMeta): """Generic hash engine.""" @abstractmethod - def compute(self) -> Digest: - """Computes the digest of data passed to the engine.""" + def compute(self, *, header: bytes = b"") -> Digest: + """Computes the digest of data passed to the engine. + + The method supports an optional header to be hashed before the data. 
+ """ pass @property diff --git a/model_signing/hashing/memory.py b/model_signing/hashing/memory.py index 673f0d88..76714e53 100644 --- a/model_signing/hashing/memory.py +++ b/model_signing/hashing/memory.py @@ -56,7 +56,8 @@ def reset(self, data: bytes = b"") -> None: self._hasher = hashlib.sha256(data) @override - def compute(self) -> hashing.Digest: + def compute(self, *, header: bytes = b"") -> hashing.Digest: + del header # unused with streaming digests, set in `reset` instead return hashing.Digest(self.digest_name, self._hasher.digest()) @override diff --git a/model_signing/hashing/precomputed.py b/model_signing/hashing/precomputed.py index 8e24c81b..c23f666e 100644 --- a/model_signing/hashing/precomputed.py +++ b/model_signing/hashing/precomputed.py @@ -42,7 +42,8 @@ class PrecomputedDigest(hashing.HashEngine): _digest_value: bytes @override - def compute(self) -> hashing.Digest: + def compute(self, *, header: bytes = b"") -> hashing.Digest: + del header # unused with precomputed digests return hashing.Digest(self._digest_type, self._digest_value) @override diff --git a/model_signing/hashing/precomputed_test.py b/model_signing/hashing/precomputed_test.py index 9cc7fda8..3fd8d43b 100644 --- a/model_signing/hashing/precomputed_test.py +++ b/model_signing/hashing/precomputed_test.py @@ -46,3 +46,11 @@ def test_expected_hash_type(self): assert hasher.digest_name == "test" digest = hasher.compute() assert digest.algorithm == "test" + + def test_compute_with_header(self): + hash_value = b"value" + hasher = precomputed.PrecomputedDigest("test", hash_value) + digest = hasher.compute() + assert digest.digest_value == hash_value + digest = hasher.compute(header="some data") + assert digest.digest_value == hash_value