From 24d68a4b1527ae273620d1d57530357294534aa7 Mon Sep 17 00:00:00 2001
From: Mihai Maruseac
Date: Thu, 6 Jun 2024 14:01:33 -0400
Subject: [PATCH] Add API for sharded serialization to a digest (#198)

* Add API for sharded serialization to a digest.

This is what used to be `serialize_v1`. Additionally, in this change we
rename `serializing` to `serialization` to be grammatically correct. We
expose `shard_size` and a new `digest_size` property from all hashing
engines. We also make imports more consistent.

Signed-off-by: Mihai Maruseac

* Change TODOs to link to issues

Signed-off-by: Mihai Maruseac

* Add test for fifo

Signed-off-by: Mihai Maruseac

* Test root as pipe too

Signed-off-by: Mihai Maruseac

* Merge `_get_sizes` and `_build_tasks`

Signed-off-by: Mihai Maruseac

---------

Signed-off-by: Mihai Maruseac
---
 model_signing/hashing/file.py             |  14 +-
 model_signing/hashing/file_test.py        |  66 +-
 model_signing/hashing/hashing.py          |  27 +-
 model_signing/hashing/memory.py           |   6 +
 model_signing/hashing/memory_test.py      |   7 +
 model_signing/hashing/precomputed.py      |  10 +-
 model_signing/hashing/precomputed_test.py |   5 +
 model_signing/manifest/manifest.py        |   8 +-
 .../__init__.py                           |   0
 model_signing/serialization/dfs.py        | 294 +++++++++
 model_signing/serialization/dfs_test.py   | 622 ++++++++++++++++++
 .../serialization.py}                     |   6 +-
 model_signing/serializing/dfs.py          | 120 ----
 model_signing/serializing/dfs_test.py     | 300 ---------
 14 files changed, 1043 insertions(+), 442 deletions(-)
 rename model_signing/{serializing => serialization}/__init__.py (100%)
 create mode 100644 model_signing/serialization/dfs.py
 create mode 100644 model_signing/serialization/dfs_test.py
 rename model_signing/{serializing/serializing.py => serialization/serialization.py} (91%)
 delete mode 100644 model_signing/serializing/dfs.py
 delete mode 100644 model_signing/serializing/dfs_test.py

diff --git a/model_signing/hashing/file.py b/model_signing/hashing/file.py
index ef88407d..c64b73e2 100644
--- a/model_signing/hashing/file.py
+++ b/model_signing/hashing/file.py
@@ -118,6 +118,12 @@ def compute(self) -> hashing.Digest:
         digest = self._content_hasher.compute()
         return hashing.Digest(self.digest_name, digest.digest_value)
 
+    @override
+    @property
+    def digest_size(self) -> int:
+        """The size, in bytes, of the digests produced by the engine."""
+        return self._content_hasher.digest_size
+
 
 class ShardedFileHasher(FileHasher):
     """File hash engine that can be invoked in parallel.
@@ -168,7 +174,7 @@ def __init__(
             raise ValueError(
                 f"Shard size must be strictly positive, got {shard_size}."
             )
-        self._shard_size = shard_size
+        self.shard_size = shard_size
 
         self.set_shard(start=start, end=end)
 
@@ -184,9 +190,9 @@ def set_shard(self, *, start: int, end: int) -> None:
                 f" got {start=}, {end=}."
             )
         read_length = end - start
-        if read_length > self._shard_size:
+        if read_length > self.shard_size:
             raise ValueError(
-                f"Must not read more than shard_size={self._shard_size}, got"
+                f"Must not read more than shard_size={self.shard_size}, got"
                 f" {read_length}."
             )
@@ -219,4 +225,4 @@ def compute(self) -> hashing.Digest:
     def digest_name(self) -> str:
         if self._digest_name_override is not None:
             return self._digest_name_override
-        return f"file-{self._content_hasher.digest_name}-{self._shard_size}"
+        return f"file-{self._content_hasher.digest_name}-{self.shard_size}"
diff --git a/model_signing/hashing/file_test.py b/model_signing/hashing/file_test.py
index e6f00d67..ceddf376 100644
--- a/model_signing/hashing/file_test.py
+++ b/model_signing/hashing/file_test.py
@@ -83,6 +83,12 @@ def test_hash_of_known_file_small_chunk(self, sample_file, expected_digest):
         digest = hasher.compute()
         assert digest.digest_hex == expected_digest
 
+    def test_hash_of_known_file_large_chunk(self, sample_file, expected_digest):
+        size = 2 * len(_FULL_CONTENT)
+        hasher = file.FileHasher(sample_file, memory.SHA256(), chunk_size=size)
+        digest = hasher.compute()
+        assert digest.digest_hex == expected_digest
+
     def test_hash_file_twice(self, sample_file):
         hasher1 = file.FileHasher(sample_file, memory.SHA256())
         digest1 = hasher1.compute()
@@ -113,7 +119,7 @@ def test_set_file(self, sample_file, sample_file_content_only):
         assert digest1.digest_value == digest2.digest_value
 
     def test_default_digest_name(self):
-        hasher = file.FileHasher("unused", memory.SHA256(), chunk_size=10)
+        hasher = file.FileHasher("unused", memory.SHA256())
         assert hasher.digest_name == "file-sha256"
 
     def test_override_digest_name(self):
@@ -130,6 +136,11 @@ def test_digest_algorithm_is_digest_name(self, sample_file):
         digest = hasher.compute()
         assert digest.algorithm == hasher.digest_name
 
+    def test_digest_size(self):
+        memory_hasher = memory.SHA256()
+        hasher = file.FileHasher("unused", memory_hasher)
+        assert hasher.digest_size == memory_hasher.digest_size
+
 
 class TestShardedFileHasher:
 
@@ -304,6 +315,54 @@ def test_hash_of_known_file_small_chunk(
         digest2 = hasher2.compute()
         assert digest2.digest_hex == expected_content_digest
 
+    def test_hash_of_known_file_large_chunk(
+        self, sample_file, expected_header_digest, expected_content_digest
+    ):
+        hasher1 = file.ShardedFileHasher(
+            sample_file,
+            memory.SHA256(),
+            start=0,
+            end=_SHARD_SIZE,
+            chunk_size=2 * len(_FULL_CONTENT),
+        )
+        hasher2 = file.ShardedFileHasher(
+            sample_file,
+            memory.SHA256(),
+            start=_SHARD_SIZE,
+            end=2 * _SHARD_SIZE,
+            chunk_size=2 * len(_FULL_CONTENT),
+        )
+
+        digest1 = hasher1.compute()
+        assert digest1.digest_hex == expected_header_digest
+
+        digest2 = hasher2.compute()
+        assert digest2.digest_hex == expected_content_digest
+
+    def test_hash_of_known_file_large_shard(
+        self, sample_file, expected_header_digest, expected_content_digest
+    ):
+        hasher1 = file.ShardedFileHasher(
+            sample_file,
+            memory.SHA256(),
+            start=0,
+            end=_SHARD_SIZE,
+            shard_size=2 * len(_FULL_CONTENT),
+        )
+        hasher2 = file.ShardedFileHasher(
+            sample_file,
+            memory.SHA256(),
+            start=_SHARD_SIZE,
+            end=2 * _SHARD_SIZE,
+            shard_size=2 * len(_FULL_CONTENT),
+        )
+
+        digest1 = hasher1.compute()
+        assert digest1.digest_hex == expected_header_digest
+
+        digest2 = hasher2.compute()
+        assert digest2.digest_hex == expected_content_digest
+
     def test_default_digest_name(self):
         hasher = file.ShardedFileHasher(
             "unused", memory.SHA256(), start=0, end=2, shard_size=10
@@ -332,3 +391,8 @@ def test_digest_algorithm_is_digest_name(self, sample_file):
         )
         digest = hasher.compute()
         assert digest.algorithm == hasher.digest_name
+
+    def test_digest_size(self):
+        memory_hasher = memory.SHA256()
+        hasher = file.FileHasher("unused", memory_hasher)
+        assert hasher.digest_size == memory_hasher.digest_size
diff --git a/model_signing/hashing/hashing.py b/model_signing/hashing/hashing.py
index f606d2f6..85c33f18 100644
--- a/model_signing/hashing/hashing.py
+++ b/model_signing/hashing/hashing.py
@@ -21,12 +21,12 @@
 specify the algorithm and the digest value.
 """
 
-from abc import ABCMeta, abstractmethod
-from dataclasses import dataclass
+import abc
+import dataclasses
 from typing import Protocol
 
 
-@dataclass(frozen=True)
+@dataclasses.dataclass(frozen=True)
 class Digest:
     """A digest computed by a `HashEngine`."""
 
@@ -38,17 +38,22 @@ def digest_hex(self) -> str:
         """Hexadecimal, human readable, equivalent of `digest`."""
         return self.digest_value.hex()
 
+    @property
+    def digest_size(self) -> int:
+        """The size, in bytes, of the digest."""
+        return len(self.digest_value)
+
 
-class HashEngine(metaclass=ABCMeta):
+class HashEngine(metaclass=abc.ABCMeta):
     """Generic hash engine."""
 
-    @abstractmethod
+    @abc.abstractmethod
     def compute(self) -> Digest:
         """Computes the digest of data passed to the engine."""
         pass
 
     @property
-    @abstractmethod
+    @abc.abstractmethod
     def digest_name(self) -> str:
         """The canonical name of the algorithm used to compute the hash.
 
@@ -60,16 +65,22 @@ def digest_name(self) -> str:
         """
         pass
 
+    @property
+    @abc.abstractmethod
+    def digest_size(self) -> int:
+        """The size, in bytes, of the digests produced by the engine."""
+        pass
+
 
 class Streaming(Protocol):
     """A protocol to support streaming data to `HashEngine` objects."""
 
-    @abstractmethod
+    @abc.abstractmethod
     def update(self, data: bytes) -> None:
         """Appends additional bytes to the data to be hashed."""
         pass
 
-    @abstractmethod
+    @abc.abstractmethod
     def reset(self, data: bytes = b"") -> None:
         """Resets the data to be hashed to the passed argument."""
         pass
diff --git a/model_signing/hashing/memory.py b/model_signing/hashing/memory.py
index 673f0d88..55a3759b 100644
--- a/model_signing/hashing/memory.py
+++ b/model_signing/hashing/memory.py
@@ -63,3 +63,9 @@ def compute(self) -> hashing.Digest:
     @property
     def digest_name(self) -> str:
         return "sha256"
+
+    @override
+    @property
+    def digest_size(self) -> int:
+        """The size, in bytes, of the digests produced by the engine."""
+        return 32
diff --git a/model_signing/hashing/memory_test.py b/model_signing/hashing/memory_test.py
index 3c6337c9..3b80209b 100644
--- a/model_signing/hashing/memory_test.py
+++ b/model_signing/hashing/memory_test.py
@@ -62,3 +62,10 @@ def test_update_after_reset(self):
 
         assert digest1.digest_hex == digest2.digest_hex
         assert digest1.digest_value == digest2.digest_value
+
+    def test_digest_size(self):
+        hasher = memory.SHA256(b"Test string")
+        assert hasher.digest_size == 32
+
+        digest = hasher.compute()
+        assert digest.digest_size == 32
diff --git a/model_signing/hashing/precomputed.py b/model_signing/hashing/precomputed.py
index 8e24c81b..b414f12c 100644
--- a/model_signing/hashing/precomputed.py
+++ b/model_signing/hashing/precomputed.py
@@ -28,13 +28,13 @@
 ```
 """
 
-from dataclasses import dataclass
+import dataclasses
 from typing_extensions import override
 
 from model_signing.hashing import hashing
 
 
-@dataclass(frozen=True)
+@dataclasses.dataclass(frozen=True)
 class PrecomputedDigest(hashing.HashEngine):
     """A wrapper around digests computed by external tooling."""
 
@@ -49,3 +49,9 @@ def compute(self) -> hashing.Digest:
     @property
     def digest_name(self) -> str:
         return self._digest_type
+
+    @override
+    @property
+    def digest_size(self) -> int:
+        """The size, in bytes, of the digests produced by the engine."""
+        return len(self._digest_value)
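For review, here is how the new `digest_size` surface fits together. This is a minimal sketch, not part of the patch, assuming only the `model_signing.hashing` modules shown above:

```python
from model_signing.hashing import memory
from model_signing.hashing import precomputed

# In-memory engine: the size is fixed by the algorithm (SHA256 -> 32 bytes)
# and is known before any data is hashed.
hasher = memory.SHA256(b"hello world")
assert hasher.digest_size == 32

# Digest values derive their size from the raw digest bytes.
digest = hasher.compute()
assert digest.digest_size == len(digest.digest_value) == 32

# Precomputed digests report the size of the wrapped value.
wrapped = precomputed.PrecomputedDigest("test", b"abcd")
assert wrapped.digest_size == 4
```

Exposing the size up front is what lets `ShardedDFSSerializer` (added below) preallocate its `digests_buffer` before any worker thread finishes.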
diff --git a/model_signing/hashing/precomputed_test.py b/model_signing/hashing/precomputed_test.py
index 9cc7fda8..1060c01e 100644
--- a/model_signing/hashing/precomputed_test.py
+++ b/model_signing/hashing/precomputed_test.py
@@ -46,3 +46,8 @@ def test_expected_hash_type(self):
         assert hasher.digest_name == "test"
         digest = hasher.compute()
         assert digest.algorithm == "test"
+
+    def test_digest_size(self):
+        digest = b"abcd"
+        hasher = precomputed.PrecomputedDigest("test", digest)
+        assert hasher.digest_size == len(digest)
diff --git a/model_signing/manifest/manifest.py b/model_signing/manifest/manifest.py
index 29cbc0d8..15870827 100644
--- a/model_signing/manifest/manifest.py
+++ b/model_signing/manifest/manifest.py
@@ -20,19 +20,19 @@
 soon.
 """
 
-from abc import ABCMeta
-from dataclasses import dataclass
+import abc
+import dataclasses
 
 from model_signing.hashing import hashing
 
 
-class Manifest(metaclass=ABCMeta):
+class Manifest(metaclass=abc.ABCMeta):
     """Generic manifest file to represent a model."""
 
     pass
 
 
-@dataclass
+@dataclasses.dataclass
 class DigestManifest(Manifest):
     """A manifest that is just a hash."""
diff --git a/model_signing/serializing/__init__.py b/model_signing/serialization/__init__.py
similarity index 100%
rename from model_signing/serializing/__init__.py
rename to model_signing/serialization/__init__.py
diff --git a/model_signing/serialization/dfs.py b/model_signing/serialization/dfs.py
new file mode 100644
index 00000000..9b4286ac
--- /dev/null
+++ b/model_signing/serialization/dfs.py
@@ -0,0 +1,294 @@
+# Copyright 2024 The Sigstore Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Model serializers that build a single hash out of a DFS traversal."""
+
+import base64
+import concurrent.futures
+import pathlib
+from typing import Callable, Iterable, TypeAlias
+from typing_extensions import override
+
+from model_signing.hashing import file
+from model_signing.hashing import hashing
+from model_signing.manifest import manifest
+from model_signing.serialization import serialization
+
+
+def _check_file_or_directory(path: pathlib.Path) -> None:
+    """Checks that the given path is either a file or a directory.
+
+    There is no support for sockets, pipes, or any other operating system
+    concept abstracted as a file.
+
+    Raises:
+        ValueError: if the path is a broken symlink, if it doesn't exist,
+          or if there are permission errors.
+    """
+    if not (path.is_file() or path.is_dir()):
+        raise ValueError(
+            f"Cannot use '{path}' as file or directory. It could be a"
+            " special file, it could be missing, or there might be a"
+            " permission issue."
+        )
+
+
+def _build_header(
+    *,
+    entry_name: str,
+    entry_type: str,
+    start: int | None = None,
+    end: int | None = None,
+) -> bytes:
+    """Builds a header to encode a path with given name and type.
+
+    Args:
+        entry_name: The name of the entry to build the header for.
+        entry_type: The type of the entry (file or directory).
+        start: Optional start offset of the file shard covered by the header.
+        end: Optional end offset of the file shard covered by the header.
+    """
+    encoded_type = entry_type.encode("utf-8")
+    # Prevent confusion if name has a "." inside by encoding to base64.
+    encoded_name = base64.b64encode(entry_name.encode("utf-8"))
+
+    if start is not None and end is not None:
+        # Note: make sure to end with a ".".
+        encoded_range = f"{start}-{end}.".encode("utf-8")
+    else:
+        # Note: no "." at end here, it will be added by `join` on return.
+        encoded_range = b""
+
+    return b".".join([encoded_type, encoded_name, encoded_range])
+
+
+class DFSSerializer(serialization.Serializer):
+    """Serializer for a model that performs a traversal of the model directory.
+
+    This serializer produces a single hash for the entire model. If the model
+    is a file, the hash is the digest of the file. If the model is a
+    directory, we perform a depth-first traversal of the directory, hash each
+    individual file, and aggregate the hashes together.
+    """
+
+    def __init__(
+        self,
+        file_hasher: file.FileHasher,
+        merge_hasher_factory: Callable[[], hashing.StreamingHashEngine],
+    ):
+        """Initializes an instance to hash a file with a specific `HashEngine`.
+
+        Args:
+            file_hasher: The hash engine used to hash the individual files.
+            merge_hasher_factory: A callable that returns a
+              `hashing.StreamingHashEngine` instance used to merge individual
+              file digests to compute an aggregate digest.
+        """
+        self._file_hasher = file_hasher
+        self._merge_hasher_factory = merge_hasher_factory
+
+    @override
+    def serialize(self, model_path: pathlib.Path) -> manifest.Manifest:
+        # TODO: #196 - Add checks to exclude symlinks if desired
+        _check_file_or_directory(model_path)
+
+        if model_path.is_file():
+            self._file_hasher.set_file(model_path)
+            return manifest.DigestManifest(self._file_hasher.compute())
+
+        return manifest.DigestManifest(self._dfs(model_path))
+
+    def _dfs(self, directory: pathlib.Path) -> hashing.Digest:
+        # TODO: #196 - Add support for excluded files
+        children = sorted([x for x in directory.iterdir()])
+
+        hasher = self._merge_hasher_factory()
+        for child in children:
+            _check_file_or_directory(child)
+
+            if child.is_file():
+                header = _build_header(entry_name=child.name, entry_type="file")
+                hasher.update(header)
+                self._file_hasher.set_file(child)
+                digest = self._file_hasher.compute()
+                hasher.update(digest.digest_value)
+            else:
+                header = _build_header(entry_name=child.name, entry_type="dir")
+                hasher.update(header)
+                digest = self._dfs(child)
+                hasher.update(digest.digest_value)
+
+        return hasher.compute()
+
+
+# Define type aliases for the ShardedDFSSerializer class below.
+_ShardSignTask: TypeAlias = tuple[pathlib.PurePath, str, int, int]
+
+
+def _endpoints(step: int, end: int) -> Iterable[int]:
+    """Yields numbers from `step` to `end` inclusive, spaced by `step`.
+
+    The last value is always equal to `end`, even when `end` is not a
+    multiple of `step`. There is always at least one value returned.
+
+    Examples:
+    ```python
+    >>> list(_endpoints(2, 8))
+    [2, 4, 6, 8]
+    >>> list(_endpoints(2, 9))
+    [2, 4, 6, 8, 9]
+    >>> list(_endpoints(2, 2))
+    [2]
+    ```
+    """
+    for value in range(step, end, step):
+        yield value
+    yield end
+
+
+class ShardedDFSSerializer(serialization.Serializer):
+    """DFSSerializer that uses a sharded hash engine to exploit parallelism."""
+
+    def __init__(
+        self,
+        file_hasher_factory: Callable[
+            [pathlib.Path, int, int], file.ShardedFileHasher
+        ],
+        merge_hasher: hashing.StreamingHashEngine,
+        max_workers: int | None = None,
+    ):
+        """Initializes an instance to hash a file with a specific `HashEngine`.
+
+        Args:
+            file_hasher_factory: A callable to build the hash engine used to
+              hash every shard of the files in the model.
+              Because each shard is processed in parallel, every thread
+              needs to call the factory to start hashing. The arguments are
+              the file and the endpoints of the shard.
+            merge_hasher: A `hashing.StreamingHashEngine` instance used to
+              merge individual file digests to compute an aggregate digest.
+            max_workers: Maximum number of workers to use in parallel.
+              Default is to defer to the `concurrent.futures` library.
+        """
+        self._file_hasher_factory = file_hasher_factory
+        self._merge_hasher = merge_hasher
+        self._max_workers = max_workers
+
+        # Precompute some private values only once by using a mock file
+        # hasher. None of the arguments used to build the hasher are used.
+        hasher = file_hasher_factory(pathlib.Path(), 0, 1)
+        self._shard_size = hasher.shard_size
+
+    @override
+    def serialize(self, model_path: pathlib.Path) -> manifest.Manifest:
+        # TODO: #196 - Add checks to exclude symlinks if desired
+        _check_file_or_directory(model_path)
+
+        if model_path.is_file():
+            entries = [model_path]
+        else:
+            # TODO: #200 - When Python3.12 is the minimum supported
+            # version, this can be replaced with `pathlib.Path.walk` for a
+            # clearer interface, and some speed improvement.
+            entries = sorted(model_path.glob("**/*"))
+
+        tasks = self._build_tasks(entries, model_path)
+
+        digest_len = self._merge_hasher.digest_size
+        digests_buffer = bytearray(len(tasks) * digest_len)
+
+        with concurrent.futures.ThreadPoolExecutor(
+            max_workers=self._max_workers
+        ) as tpe:
+            futures_dict = {
+                tpe.submit(self._hash_task, model_path, task): i
+                for i, task in enumerate(tasks)
+            }
+            for future in concurrent.futures.as_completed(futures_dict):
+                i = futures_dict[future]
+                task_digest = future.result()
+
+                task_path, task_type, task_start, task_end = tasks[i]
+                header = _build_header(
+                    entry_name=task_path.name,
+                    entry_type=task_type,
+                    start=task_start,
+                    end=task_end,
+                )
+                self._merge_hasher.reset(header)
+                self._merge_hasher.update(task_digest)
+                digest = self._merge_hasher.compute().digest_value
+
+                start = i * digest_len
+                end = start + digest_len
+                digests_buffer[start:end] = digest
+
+        self._merge_hasher.reset(digests_buffer)
+        return manifest.DigestManifest(self._merge_hasher.compute())
+
+    def _build_tasks(
+        self, paths: Iterable[pathlib.Path], root_path: pathlib.Path
+    ) -> list[_ShardSignTask]:
+        """Builds the tasks that would hash shards of files in parallel.
+
+        Every file in `paths` is replaced by a set of tasks. Each task
+        computes the digest over a shard of the file. Directories result in
+        a single task, just to compute a digest over a header.
+
+        To differentiate between (empty) files and directories with the same
+        name, every task needs to also include a header. The header needs to
+        include the relative path to the model root, as we want to obtain
+        the same digest if the model is moved.
+
+        We don't construct an enum for the type of the entry, because these
+        types never escape this class.
+
+        Note that the path component of the tasks is a `pathlib.PurePath`,
+        so operations on it cannot touch the filesystem.
+ """ + # TODO: #196 - Add support for excluded files + + tasks = [] + for path in paths: + _check_file_or_directory(path) + relative_path = path.relative_to(root_path) + + if path.is_file(): + path_size = path.stat().st_size + start = 0 + for end in _endpoints(self._shard_size, path_size): + tasks.append((relative_path, "file", start, end)) + start = end + else: + tasks.append((relative_path, "dir", 0, 0)) + + return tasks + + def _hash_task( + self, model_path: pathlib.Path, task: _ShardSignTask + ) -> bytes: + """Produces the hash of the file shard included in `task`.""" + task_path, task_type, task_start, task_end = task + + # TODO: #197 - Directories don't need to use the file hasher. + # Rather than starting a process just for them, we should filter these + # ahead of time, and only use threading for file shards. For now, just + # return an empty result. + if task_type == "dir": + return b"" + + # TODO: #197 - Similarly, empty files should be hashed outside + # of a parallel task, to not waste resources. + if task_start == task_end: + return b"" + + full_path = model_path.joinpath(task_path) + hasher = self._file_hasher_factory(full_path, task_start, task_end) + return hasher.compute().digest_value diff --git a/model_signing/serialization/dfs_test.py b/model_signing/serialization/dfs_test.py new file mode 100644 index 00000000..ee84a7e5 --- /dev/null +++ b/model_signing/serialization/dfs_test.py @@ -0,0 +1,622 @@ +# Copyright 2024 The Sigstore Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import pathlib +import pytest + +from model_signing.hashing import file +from model_signing.hashing import memory +from model_signing.serialization import dfs + + +# some constants used throughout testing +_KNOWN_MODEL_TEXT: bytes = b"This is a simple model" +_ANOTHER_MODEL_TEXT: bytes = b"This is another simple model" + + +# Note: Don't make fixtures with global scope as we are altering the models! 
diff --git a/model_signing/serialization/dfs_test.py b/model_signing/serialization/dfs_test.py
new file mode 100644
index 00000000..ee84a7e5
--- /dev/null
+++ b/model_signing/serialization/dfs_test.py
@@ -0,0 +1,622 @@
+# Copyright 2024 The Sigstore Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import pathlib
+import pytest
+
+from model_signing.hashing import file
+from model_signing.hashing import memory
+from model_signing.serialization import dfs
+
+
+# some constants used throughout testing
+_KNOWN_MODEL_TEXT: bytes = b"This is a simple model"
+_ANOTHER_MODEL_TEXT: bytes = b"This is another simple model"
+
+
+# Note: Don't make fixtures with global scope as we are altering the models!
+@pytest.fixture
+def sample_model_file(tmp_path_factory):
+    file = tmp_path_factory.mktemp("model") / "file"
+    file.write_bytes(_KNOWN_MODEL_TEXT)
+    return file
+
+
+@pytest.fixture
+def empty_model_file(tmp_path_factory):
+    file = tmp_path_factory.mktemp("model") / "file"
+    file.write_bytes(b"")
+    return file
+
+
+@pytest.fixture
+def sample_model_folder(tmp_path_factory):
+    model_root = tmp_path_factory.mktemp("model") / "root"
+    model_root.mkdir()
+
+    for i in range(2):
+        root_dir = model_root / f"d{i}"
+        root_dir.mkdir()
+        for j in range(3):
+            dir_file = root_dir / f"f{i}{j}"
+            dir_file.write_text(f"This is file f{i}{j} in d{i}.")
+
+    for i in range(4):
+        root_file = model_root / f"f{i}"
+        root_file.write_text(f"This is file f{i} in root.")
+
+    return model_root
+
+
+@pytest.fixture
+def empty_model_folder(tmp_path_factory):
+    model_root = tmp_path_factory.mktemp("model") / "root"
+    model_root.mkdir()
+    return model_root
+
+
+@pytest.fixture
+def deep_model_folder(tmp_path_factory):
+    model_root = tmp_path_factory.mktemp("model") / "root"
+    model_root.mkdir()
+
+    current = model_root
+    for i in range(5):
+        current = current / f"d{i}"
+        current.mkdir()
+
+    for i in range(4):
+        file = current / f"f{i}"
+        file.write_text(f"This is file f{i}.")
+
+    return model_root
+
+
+class TestDFSSerializer:
+
+    def test_known_file(self, sample_model_file):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+        manifest = serializer.serialize(sample_model_file)
+        expected = (
+            "3aab065c7181a173b5dd9e9d32a9f79923440b413be1e1ffcdba26a7365f719b"
+        )
+        assert manifest.digest.digest_hex == expected
+
+    def test_file_hash_is_same_as_hash_of_content(self, sample_model_file):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+        manifest = serializer.serialize(sample_model_file)
+        digest = memory.SHA256(_KNOWN_MODEL_TEXT).compute()
+        assert manifest.digest.digest_hex == digest.digest_hex
+
+    def test_file_model_hash_is_same_if_model_is_moved(self, sample_model_file):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+        manifest = serializer.serialize(sample_model_file)
+
+        new_name = sample_model_file.with_name("new-file")
+        new_file = sample_model_file.rename(new_name)
+        new_manifest = serializer.serialize(new_file)
+
+        assert manifest == new_manifest
+
+    def test_file_model_hash_changes_if_content_changes(
+        self, sample_model_file
+    ):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+        manifest = serializer.serialize(sample_model_file)
+
+        sample_model_file.write_bytes(_ANOTHER_MODEL_TEXT)
+        new_manifest = serializer.serialize(sample_model_file)
+
+        assert manifest.digest.algorithm == new_manifest.digest.algorithm
+        assert manifest.digest.digest_value != new_manifest.digest.digest_value
+
+    def test_directory_model_with_only_known_file(self, sample_model_file):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+
+        model = sample_model_file.parent
+        manifest = serializer.serialize(model)
+
+        expected = (
+            "a0865eb7e299e3bca3951e24930c56dcf1533ecff63bda06a9be67906773c628"
+        )
+        assert manifest.digest.digest_hex == expected
+
+        digest = memory.SHA256(_KNOWN_MODEL_TEXT).compute()
+        assert manifest.digest.digest_hex != digest.digest_hex
+    def test_known_folder(self, sample_model_folder):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+        manifest = serializer.serialize(sample_model_folder)
+        expected = (
+            "310af4fc4c52bf63cd1687c67076ed3e56bc5480a1b151539e6c550506ae0301"
+        )
+        assert manifest.digest.digest_hex == expected
+
+    def test_folder_model_hash_is_same_if_model_is_moved(
+        self, sample_model_folder
+    ):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+        manifest = serializer.serialize(sample_model_folder)
+
+        new_name = sample_model_folder.with_name("new-root")
+        new_model = sample_model_folder.rename(new_name)
+        new_manifest = serializer.serialize(new_model)
+
+        assert manifest == new_manifest
+
+    def test_empty_file(self, empty_model_file):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+        manifest = serializer.serialize(empty_model_file)
+        expected = (
+            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
+        )
+        assert manifest.digest.digest_hex == expected
+
+    def test_directory_model_with_only_empty_file(self, empty_model_file):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+        manifest = serializer.serialize(empty_model_file)
+        model = empty_model_file.parent
+        manifest = serializer.serialize(model)
+        expected = (
+            "8a587b2129fdecfbea38d5152b626299f5994d9b99d36b321aea356f69b38c61"
+        )
+        assert manifest.digest.digest_hex == expected
+
+    def test_empty_folder(self, empty_model_folder):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+        manifest = serializer.serialize(empty_model_folder)
+        expected = (
+            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
+        )
+        assert manifest.digest.digest_hex == expected
+
+    def test_empty_folder_hashes_the_same_as_empty_file(
+        self, empty_model_file, empty_model_folder
+    ):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+        folder_manifest = serializer.serialize(empty_model_folder)
+        file_manifest = serializer.serialize(empty_model_file)
+        assert (
+            folder_manifest.digest.digest_hex == file_manifest.digest.digest_hex
+        )
+
+    def test_folder_model_empty_entry(self, sample_model_folder):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+
+        # Alter first directory within the model
+        dirs = [d for d in sample_model_folder.iterdir() if d.is_dir()]
+        altered_dir = dirs[0]
+
+        new_empty_dir = altered_dir / "empty"
+        new_empty_dir.mkdir()
+        manifest1 = serializer.serialize(sample_model_folder)
+
+        new_empty_dir.rmdir()
+
+        new_empty_file = altered_dir / "empty"
+        new_empty_file.write_text("")
+        manifest2 = serializer.serialize(sample_model_folder)
+
+        assert manifest1.digest != manifest2.digest
+
+    def test_folder_model_rename_file(self, sample_model_folder):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+        manifest1 = serializer.serialize(sample_model_folder)
+
+        # Alter first directory within the model
+        dirs = [d for d in sample_model_folder.iterdir() if d.is_dir()]
+        altered_dir = dirs[0]
+
+        # Alter first file in the altered_dir
+        files = [f for f in altered_dir.iterdir() if f.is_file()]
+        file_to_rename = files[0]
+
+        new_name = file_to_rename.with_name("new-file")
+        file_to_rename.rename(new_name)
+
+        manifest2 = serializer.serialize(sample_model_folder)
+        assert manifest1.digest != manifest2.digest
+
+    def test_folder_model_rename_dir(self, sample_model_folder):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+        manifest1 = serializer.serialize(sample_model_folder)
+
+        # Alter first directory within the model
+        dirs = [d for d in sample_model_folder.iterdir() if d.is_dir()]
+        dir_to_rename = dirs[0]
+
+        new_name = dir_to_rename.with_name("new-dir")
+        dir_to_rename.rename(new_name)
+
+        manifest2 = serializer.serialize(sample_model_folder)
+        assert manifest1.digest != manifest2.digest
+
+    def test_folder_model_replace_file_empty_folder(self, sample_model_folder):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+        manifest1 = serializer.serialize(sample_model_folder)
+
+        # Alter first directory within the model
+        dirs = [d for d in sample_model_folder.iterdir() if d.is_dir()]
+        altered_dir = dirs[0]
+
+        # Replace first file in the altered_dir
+        files = [f for f in altered_dir.iterdir() if f.is_file()]
+        file_to_replace = files[0]
+        file_to_replace.unlink()
+        file_to_replace.mkdir()
+
+        manifest2 = serializer.serialize(sample_model_folder)
+        assert manifest1.digest != manifest2.digest
+
+    def test_folder_model_change_file(self, sample_model_folder):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+        manifest1 = serializer.serialize(sample_model_folder)
+
+        # Alter first directory within the model
+        dirs = [d for d in sample_model_folder.iterdir() if d.is_dir()]
+        altered_dir = dirs[0]
+
+        # Alter first file in the altered_dir
+        files = [f for f in altered_dir.iterdir() if f.is_file()]
+        file_to_change = files[0]
+        file_to_change.write_bytes(_KNOWN_MODEL_TEXT)
+
+        manifest2 = serializer.serialize(sample_model_folder)
+        assert manifest1.digest != manifest2.digest
+
+    def test_deep_folder(self, deep_model_folder):
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+        manifest = serializer.serialize(deep_model_folder)
+        expected = (
+            "36eed9389ebbbe15ac15d33c81dabb60ccb7c945ff641d78f59db9aa9dc47ac9"
+        )
+        assert manifest.digest.digest_hex == expected
+
+    def test_special_file(self, sample_model_folder):
+        # Alter first directory within the model
+        dirs = [d for d in sample_model_folder.iterdir() if d.is_dir()]
+        altered_dir = dirs[0]
+
+        # Create a pipe in the altered_dir
+        pipe = altered_dir / "pipe"
+
+        try:
+            os.mkfifo(pipe)
+        except AttributeError:
+            # On Windows, `os.mkfifo` does not exist (it should not).
+            return
+
+        file_hasher = file.FileHasher("unused", memory.SHA256())
+        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
+
+        with pytest.raises(
+            ValueError, match="Cannot use .* as file or directory"
+        ):
+            serializer.serialize(sample_model_folder)
+
+        # Also do the same for the pipe itself
+        with pytest.raises(
+            ValueError, match="Cannot use .* as file or directory"
+        ):
+            serializer.serialize(pipe)
+
+
+class TestShardedDFSSerializer:
+
+    def _hasher_factory(
+        self, path: pathlib.Path, start: int, end: int
+    ) -> file.ShardedFileHasher:
+        return file.ShardedFileHasher(
+            path, memory.SHA256(), start=start, end=end
+        )
+
+    def _hasher_factory_small_shards(
+        self, path: pathlib.Path, start: int, end: int
+    ) -> file.ShardedFileHasher:
+        return file.ShardedFileHasher(
+            path, memory.SHA256(), start=start, end=end, shard_size=2
+        )
+
+    def test_known_file(self, sample_model_file):
+        serializer = dfs.ShardedDFSSerializer(
+            self._hasher_factory, memory.SHA256()
+        )
+        manifest = serializer.serialize(sample_model_file)
+        expected = (
+            "2ca48c47d5311a9b2f9305519cd5f927dcef09404fc32ef7886abe8f11450eff"
+        )
+        assert manifest.digest.digest_hex == expected
+
+    def test_file_hash_is_not_same_as_hash_of_content(self, sample_model_file):
+        serializer = dfs.ShardedDFSSerializer(
+            self._hasher_factory, memory.SHA256()
+        )
+        manifest = serializer.serialize(sample_model_file)
+        digest = memory.SHA256(_KNOWN_MODEL_TEXT).compute()
+        assert manifest.digest.digest_hex != digest.digest_hex
+
+    def test_file_model_hash_is_same_if_model_is_moved(self, sample_model_file):
+        serializer = dfs.ShardedDFSSerializer(
+            self._hasher_factory, memory.SHA256()
+        )
+        manifest = serializer.serialize(sample_model_file)
+
+        new_name = sample_model_file.with_name("new-file")
+        new_file = sample_model_file.rename(new_name)
+        new_manifest = serializer.serialize(new_file)
+
+        assert manifest == new_manifest
+
+    def test_file_model_hash_changes_if_content_changes(
+        self, sample_model_file
+    ):
+        serializer = dfs.ShardedDFSSerializer(
+            self._hasher_factory, memory.SHA256()
+        )
+        manifest = serializer.serialize(sample_model_file)
+
+        sample_model_file.write_bytes(_ANOTHER_MODEL_TEXT)
+        new_manifest = serializer.serialize(sample_model_file)
+
+        assert manifest.digest.algorithm == new_manifest.digest.algorithm
+        assert manifest.digest.digest_value != new_manifest.digest.digest_value
+
+    def test_directory_model_with_only_known_file(self, sample_model_file):
+        serializer = dfs.ShardedDFSSerializer(
+            self._hasher_factory, memory.SHA256()
+        )
+
+        model = sample_model_file.parent
+        manifest = serializer.serialize(model)
+
+        expected = (
+            "c030412c4c9e7f46396b591b1b6c4a4e40c15d9b9ca0b3512af8b20f3219c07f"
+        )
+        assert manifest.digest.digest_hex == expected
+
+        digest = memory.SHA256(_KNOWN_MODEL_TEXT).compute()
+        assert manifest.digest.digest_hex != digest.digest_hex
+
+    def test_known_folder(self, sample_model_folder):
+        serializer = dfs.ShardedDFSSerializer(
+            self._hasher_factory, memory.SHA256()
+        )
+        manifest = serializer.serialize(sample_model_folder)
+        expected = (
+            "d22e0441cfa5ac2bc09715ddd88c802a7f97e29c93dc50f5498bab2954958ebb"
+        )
+        assert manifest.digest.digest_hex == expected
+
+    def test_folder_model_hash_is_same_if_model_is_moved(
+        self, sample_model_folder
+    ):
+        serializer = dfs.ShardedDFSSerializer(
+            self._hasher_factory, memory.SHA256()
+        )
+        manifest = serializer.serialize(sample_model_folder)
+
+        new_name = sample_model_folder.with_name("new-root")
+        new_model = sample_model_folder.rename(new_name)
+        new_manifest = serializer.serialize(new_model)
+
+        assert manifest == new_manifest
+
+    def test_empty_file(self, empty_model_file):
+        serializer = dfs.ShardedDFSSerializer(
+            self._hasher_factory, memory.SHA256()
+        )
+        manifest = serializer.serialize(empty_model_file)
+        expected = (
+            "5f2d126b0d3540c17481fdf724e31cf03b4436a2ebabaa1d2e94fe09831be64d"
+        )
+        assert manifest.digest.digest_hex == expected
+
+    def test_directory_model_with_only_empty_file(self, empty_model_file):
+        serializer = dfs.ShardedDFSSerializer(
+            self._hasher_factory, memory.SHA256()
+        )
+        manifest = serializer.serialize(empty_model_file)
+        model = empty_model_file.parent
+        manifest = serializer.serialize(model)
+        expected = (
+            "74e81d0062f0a0674014c2f0e4b79985d5015f98a64089e7106a44d32e9ff11f"
+        )
+        assert manifest.digest.digest_hex == expected
+
+    def test_empty_folder(self, empty_model_folder):
+        serializer = dfs.ShardedDFSSerializer(
+            self._hasher_factory, memory.SHA256()
+        )
+        manifest = serializer.serialize(empty_model_folder)
+        expected = (
+            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
+        )
+        assert manifest.digest.digest_hex == expected
+
+    def test_folder_model_empty_entry(self, sample_model_folder):
+        serializer = dfs.ShardedDFSSerializer(
+            self._hasher_factory, memory.SHA256()
+        )
+
+        # Alter first directory within the model
+        dirs = [d for d in sample_model_folder.iterdir() if d.is_dir()]
+        altered_dir = dirs[0]
+
+        new_empty_dir = altered_dir / "empty"
+        new_empty_dir.mkdir()
+        manifest1 = serializer.serialize(sample_model_folder)
+
+        new_empty_dir.rmdir()
+
+        new_empty_file = altered_dir / "empty"
+        new_empty_file.write_text("")
+        manifest2 = serializer.serialize(sample_model_folder)
+
+        assert manifest1.digest != manifest2.digest
+
+    def test_folder_model_rename_file(self, sample_model_folder):
+        serializer = dfs.ShardedDFSSerializer(
+            self._hasher_factory, memory.SHA256()
+        )
+        manifest1 = serializer.serialize(sample_model_folder)
+
+        # Alter first directory within the model
+        dirs = [d for d in sample_model_folder.iterdir() if d.is_dir()]
+        altered_dir = dirs[0]
+
+        # Alter first file in the altered_dir
+        files = [f for f in altered_dir.iterdir() if f.is_file()]
+        file_to_rename = files[0]
+
+        new_name = file_to_rename.with_name("new-file")
+        file_to_rename.rename(new_name)
+
+        manifest2 = serializer.serialize(sample_model_folder)
+        assert manifest1.digest != manifest2.digest
+
+    def test_folder_model_rename_dir(self, sample_model_folder):
+        serializer = dfs.ShardedDFSSerializer(
+            self._hasher_factory, memory.SHA256()
+        )
+        manifest1 = serializer.serialize(sample_model_folder)
+
+        # Alter first directory within the model
+        dirs = [d for d in sample_model_folder.iterdir() if d.is_dir()]
+        dir_to_rename = dirs[0]
+
+        new_name = dir_to_rename.with_name("new-dir")
+        dir_to_rename.rename(new_name)
+
+        manifest2 = serializer.serialize(sample_model_folder)
+        assert manifest1.digest != manifest2.digest
+
+    def test_folder_model_replace_file_empty_folder(self, sample_model_folder):
+        serializer = dfs.ShardedDFSSerializer(
+            self._hasher_factory, memory.SHA256()
+        )
+        manifest1 = serializer.serialize(sample_model_folder)
+
+        # Alter first directory within the model
+        dirs = [d for d in sample_model_folder.iterdir() if d.is_dir()]
+        altered_dir = dirs[0]
+
+        # Replace first file in the altered_dir
+        files = [f for f in altered_dir.iterdir() if f.is_file()]
+        file_to_replace = files[0]
+        file_to_replace.unlink()
+        file_to_replace.mkdir()
+
+        manifest2 = serializer.serialize(sample_model_folder)
+        assert manifest1.digest != manifest2.digest
+
+    def test_folder_model_change_file(self, sample_model_folder):
+        serializer = dfs.ShardedDFSSerializer(
+            self._hasher_factory, memory.SHA256()
+        )
+        manifest1 = serializer.serialize(sample_model_folder)
+
+        # Alter first directory within the model
+        dirs = [d for d in sample_model_folder.iterdir() if d.is_dir()]
+        altered_dir = dirs[0]
+
+        # Alter first file in the altered_dir
+        files = [f for f in altered_dir.iterdir() if f.is_file()]
+        file_to_change = files[0]
+        file_to_change.write_bytes(_KNOWN_MODEL_TEXT)
+
+        manifest2 = serializer.serialize(sample_model_folder)
+        assert manifest1.digest != manifest2.digest
+
+    def test_deep_folder(self, deep_model_folder):
+        serializer = dfs.ShardedDFSSerializer(
+            self._hasher_factory, memory.SHA256()
+        )
+        manifest = serializer.serialize(deep_model_folder)
+        expected = (
+            "52fa3c459aec58bc5f9702c73cb3c6b8fd19e9342aa3e4db851e1bde69ab1727"
+        )
+        assert manifest.digest.digest_hex == expected
+
+    def test_max_workers_does_not_change_digest(self, sample_model_folder):
+        serializer1 = dfs.ShardedDFSSerializer(
+            self._hasher_factory, memory.SHA256()
+        )
+        manifest1 = serializer1.serialize(sample_model_folder)
+
+        serializer2 = dfs.ShardedDFSSerializer(
+            self._hasher_factory, memory.SHA256(), max_workers=2
+        )
+        manifest2 = serializer2.serialize(sample_model_folder)
+
+        assert manifest1 == manifest2
+
+    def test_shard_size_changes_digests(self, sample_model_folder):
+        serializer1 = dfs.ShardedDFSSerializer(
+            self._hasher_factory, memory.SHA256()
+        )
+        manifest1 = serializer1.serialize(sample_model_folder)
+
+        serializer2 = dfs.ShardedDFSSerializer(
+            self._hasher_factory_small_shards, memory.SHA256()
+        )
+        manifest2 = serializer2.serialize(sample_model_folder)
+
+        assert manifest1.digest.digest_value != manifest2.digest.digest_value
+
+    def test_special_file(self, sample_model_folder):
+        # Alter first directory within the model
+        dirs = [d for d in sample_model_folder.iterdir() if d.is_dir()]
+        altered_dir = dirs[0]
+
+        # Create a pipe in the altered_dir
+        pipe = altered_dir / "pipe"
+
+        try:
+            os.mkfifo(pipe)
+        except AttributeError:
+            # On Windows, `os.mkfifo` does not exist (it should not).
+            return
+
+        serializer = dfs.ShardedDFSSerializer(
+            self._hasher_factory, memory.SHA256()
+        )
+
+        with pytest.raises(
+            ValueError, match="Cannot use .* as file or directory"
+        ):
+            serializer.serialize(sample_model_folder)
+
+        # Also do the same for the pipe itself
+        with pytest.raises(
+            ValueError, match="Cannot use .* as file or directory"
+        ):
+            serializer.serialize(pipe)
diff --git a/model_signing/serializing/serializing.py b/model_signing/serialization/serialization.py
similarity index 91%
rename from model_signing/serializing/serializing.py
rename to model_signing/serialization/serialization.py
index 50c8f729..be142e77 100644
--- a/model_signing/serializing/serializing.py
+++ b/model_signing/serialization/serialization.py
@@ -18,16 +18,16 @@
 directory, but more serializers are coming soon.
""" -from abc import ABCMeta, abstractmethod +import abc import pathlib from model_signing.manifest import manifest -class Serializer(metaclass=ABCMeta): +class Serializer(metaclass=abc.ABCMeta): """Generic ML model format serializer.""" - @abstractmethod + @abc.abstractmethod def serialize(self, model_path: pathlib.Path) -> manifest.Manifest: """Serializes the model given by the `model_path` argument.""" pass diff --git a/model_signing/serializing/dfs.py b/model_signing/serializing/dfs.py deleted file mode 100644 index 5d42c4c1..00000000 --- a/model_signing/serializing/dfs.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright 2024 The Sigstore Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Model serializers that build a single hash out of a DFS traversal.""" - -import base64 -import pathlib -from typing import Callable -from typing_extensions import override - -from model_signing.hashing import file -from model_signing.hashing import hashing -from model_signing.manifest import manifest -from model_signing.serializing import serializing - - -def _check_file_or_directory(path: pathlib.Path) -> bool: - """Checks that the given path is either a file or a directory. - - There is no support for sockets, pipes, or any other operating system - concept abstracted as a file. - - Furthermore, this would return False if the path is a broken symlink, if it - doesn't exists or if there are permission errors. - """ - return path.is_file() or path.is_dir() - - -def _build_header(*, entry_name: str, entry_type: str) -> bytes: - """Builds a header to encode a path with given name and type. - - Args: - entry_name: The name of the entry to build the header for. - entry_type: The type of the entry (file or directory). - """ - encoded_type = entry_type.encode("utf-8") - # Prevent confusion if name has a "." inside by encoding to base64. - encoded_name = base64.b64encode(entry_name.encode("utf-8")) - # Note: make sure to end with a ".". - return b".".join([encoded_type, encoded_name, b""]) - - -class DFSSerializer(serializing.Serializer): - """Serializer for a model that performs a traversal of the model directory. - - This serializer produces a single hash for the entire model. If the model is - a file, the hash is the digest of the file. If the model is a directory, we - perform a depth-first traversal of the directory, hash each individual files - and aggregate the hashes together. - """ - - def __init__( - self, - file_hasher: file.FileHasher, - merge_hasher_factory: Callable[[], hashing.StreamingHashEngine], - ): - """Initializes an instance to hash a file with a specific `HashEngine`. - - Args: - hasher: The hash engine used to hash the individual files. - merge_hasher_factory: A callable that returns a - `hashing.StreamingHashEngine` instance used to merge individual - file digests to compute an aggregate digest. 
- """ - self._file_hasher = file_hasher - self._merge_hasher_factory = merge_hasher_factory - - @override - def serialize(self, model_path: pathlib.Path) -> manifest.Manifest: - # TODO(mihaimaruseac): Add checks to exclude symlinks if desired - if not _check_file_or_directory(model_path): - raise ValueError( - f"Cannot use '{model_path}' as file or directory. It could be a" - " special file, it could be missing, or there might be a" - " permission issue." - ) - - if model_path.is_file(): - self._file_hasher.set_file(model_path) - return manifest.DigestManifest(self._file_hasher.compute()) - - return manifest.DigestManifest(self._dfs(model_path)) - - def _dfs(self, directory: pathlib.Path) -> hashing.Digest: - # TODO(mihaimaruseac): Add support for excluded files - children = sorted([x for x in directory.iterdir()]) - - hasher = self._merge_hasher_factory() - for child in children: - if not _check_file_or_directory(child): - raise ValueError( - f"Cannot use '{child}' as file or directory. It could be a" - " special file, it could be missing, or there might be a" - " permission issue." - ) - - if child.is_file(): - header = _build_header(entry_name=child.name, entry_type="file") - hasher.update(header) - self._file_hasher.set_file(child) - digest = self._file_hasher.compute() - hasher.update(digest.digest_value) - else: - header = _build_header(entry_name=child.name, entry_type="dir") - hasher.update(header) - digest = self._dfs(child) - hasher.update(digest.digest_value) - - return hasher.compute() diff --git a/model_signing/serializing/dfs_test.py b/model_signing/serializing/dfs_test.py deleted file mode 100644 index 8ff67c20..00000000 --- a/model_signing/serializing/dfs_test.py +++ /dev/null @@ -1,300 +0,0 @@ -# Copyright 2024 The Sigstore Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - -from model_signing.hashing import file -from model_signing.hashing import memory -from model_signing.serializing import dfs - - -# some constants used throughout testing -_KNOWN_MODEL_TEXT: bytes = b"This is a simple model" -_ANOTHER_MODEL_TEXT: bytes = b"This is another simple model" - - -# Note: Don't make fixtures with global scope as we are altering the models! 
-@pytest.fixture
-def sample_model_file(tmp_path_factory):
-    file = tmp_path_factory.mktemp("model") / "file"
-    file.write_bytes(_KNOWN_MODEL_TEXT)
-    return file
-
-
-@pytest.fixture
-def empty_model_file(tmp_path_factory):
-    file = tmp_path_factory.mktemp("model") / "file"
-    file.write_bytes(b"")
-    return file
-
-
-@pytest.fixture
-def sample_model_folder(tmp_path_factory):
-    model_root = tmp_path_factory.mktemp("model") / "root"
-    model_root.mkdir()
-
-    for i in range(2):
-        root_dir = model_root / f"d{i}"
-        root_dir.mkdir()
-        for j in range(3):
-            dir_file = root_dir / f"f{i}{j}"
-            dir_file.write_text(f"This is file f{i}{j} in d{i}.")
-
-    for i in range(4):
-        root_file = model_root / f"f{i}"
-        root_file.write_text(f"This is file f{i} in root.")
-
-    return model_root
-
-
-@pytest.fixture
-def empty_model_folder(tmp_path_factory):
-    model_root = tmp_path_factory.mktemp("model") / "root"
-    model_root.mkdir()
-    return model_root
-
-
-@pytest.fixture
-def deep_model_folder(tmp_path_factory):
-    model_root = tmp_path_factory.mktemp("model") / "root"
-    model_root.mkdir()
-
-    current = model_root
-    for i in range(5):
-        current = current / f"d{i}"
-        current.mkdir()
-
-    for i in range(4):
-        file = current / f"f{i}"
-        file.write_text(f"This is file f{i}.")
-
-    return model_root
-
-
-class TestDFSSerializer:
-
-    def test_known_file(self, sample_model_file):
-        file_hasher = file.FileHasher("unused", memory.SHA256())
-        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
-        manifest = serializer.serialize(sample_model_file)
-        expected = (
-            "3aab065c7181a173b5dd9e9d32a9f79923440b413be1e1ffcdba26a7365f719b"
-        )
-        assert manifest.digest.digest_hex == expected
-
-    def test_file_hash_is_same_as_hash_of_content(self, sample_model_file):
-        file_hasher = file.FileHasher("unused", memory.SHA256())
-        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
-        manifest = serializer.serialize(sample_model_file)
-        digest = memory.SHA256(_KNOWN_MODEL_TEXT).compute()
-        assert manifest.digest.digest_hex == digest.digest_hex
-
-    def test_file_model_hash_is_same_if_model_is_moved(self, sample_model_file):
-        file_hasher = file.FileHasher("unused", memory.SHA256())
-        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
-        manifest = serializer.serialize(sample_model_file)
-
-        new_name = sample_model_file.with_name("new-file")
-        new_file = sample_model_file.rename(new_name)
-        new_manifest = serializer.serialize(new_file)
-
-        assert manifest == new_manifest
-
-    def test_file_model_hash_changes_if_content_changes(
-        self, sample_model_file
-    ):
-        file_hasher = file.FileHasher("unused", memory.SHA256())
-        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
-        manifest = serializer.serialize(sample_model_file)
-
-        sample_model_file.write_bytes(_ANOTHER_MODEL_TEXT)
-        new_manifest = serializer.serialize(sample_model_file)
-
-        assert manifest.digest.algorithm == new_manifest.digest.algorithm
-        assert manifest.digest.digest_value != new_manifest.digest.digest_value
-
-    def test_directory_model_with_only_known_file(self, sample_model_file):
-        file_hasher = file.FileHasher("unused", memory.SHA256())
-        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
-
-        model = sample_model_file.parent
-        manifest = serializer.serialize(model)
-
-        expected = (
-            "a0865eb7e299e3bca3951e24930c56dcf1533ecff63bda06a9be67906773c628"
-        )
-        assert manifest.digest.digest_hex == expected
-
-        digest = memory.SHA256(_KNOWN_MODEL_TEXT).compute()
-        assert manifest.digest.digest_hex != digest.digest_hex
-    def test_known_folder(self, sample_model_folder):
-        file_hasher = file.FileHasher("unused", memory.SHA256())
-        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
-        manifest = serializer.serialize(sample_model_folder)
-        expected = (
-            "310af4fc4c52bf63cd1687c67076ed3e56bc5480a1b151539e6c550506ae0301"
-        )
-        assert manifest.digest.digest_hex == expected
-
-    def test_folder_model_hash_is_same_if_model_is_moved(
-        self, sample_model_folder
-    ):
-        file_hasher = file.FileHasher("unused", memory.SHA256())
-        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
-        manifest = serializer.serialize(sample_model_folder)
-
-        new_name = sample_model_folder.with_name("new-root")
-        new_model = sample_model_folder.rename(new_name)
-        new_manifest = serializer.serialize(new_model)
-
-        assert manifest == new_manifest
-
-    def test_empty_file(self, empty_model_file):
-        file_hasher = file.FileHasher("unused", memory.SHA256())
-        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
-        manifest = serializer.serialize(empty_model_file)
-        expected = (
-            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
-        )
-        assert manifest.digest.digest_hex == expected
-
-    def test_directory_model_with_only_empty_file(self, empty_model_file):
-        file_hasher = file.FileHasher("unused", memory.SHA256())
-        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
-        manifest = serializer.serialize(empty_model_file)
-        model = empty_model_file.parent
-        manifest = serializer.serialize(model)
-        expected = (
-            "8a587b2129fdecfbea38d5152b626299f5994d9b99d36b321aea356f69b38c61"
-        )
-        assert manifest.digest.digest_hex == expected
-
-    def test_empty_folder(self, empty_model_folder):
-        file_hasher = file.FileHasher("unused", memory.SHA256())
-        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
-        manifest = serializer.serialize(empty_model_folder)
-        expected = (
-            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
-        )
-        assert manifest.digest.digest_hex == expected
-
-    def test_empty_folder_hashes_the_same_as_empty_file(
-        self, empty_model_file, empty_model_folder
-    ):
-        file_hasher = file.FileHasher("unused", memory.SHA256())
-        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
-        folder_manifest = serializer.serialize(empty_model_folder)
-        file_manifest = serializer.serialize(empty_model_file)
-        assert (
-            folder_manifest.digest.digest_hex == file_manifest.digest.digest_hex
-        )
-
-    def test_folder_model_empty_entry(self, sample_model_folder):
-        file_hasher = file.FileHasher("unused", memory.SHA256())
-        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
-
-        # Alter first directory within the model
-        dirs = [d for d in sample_model_folder.iterdir() if d.is_dir()]
-        altered_dir = dirs[0]
-
-        new_empty_dir = altered_dir / "empty"
-        new_empty_dir.mkdir()
-        manifest1 = serializer.serialize(sample_model_folder)
-
-        new_empty_dir.rmdir()
-
-        new_empty_file = altered_dir / "empty"
-        new_empty_file.write_text("")
-        manifest2 = serializer.serialize(sample_model_folder)
-
-        assert manifest1.digest != manifest2.digest
-
-    def test_folder_model_rename_file(self, sample_model_folder):
-        file_hasher = file.FileHasher("unused", memory.SHA256())
-        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
-        manifest1 = serializer.serialize(sample_model_folder)
-
-        # Alter first directory within the model
-        dirs = [d for d in sample_model_folder.iterdir() if d.is_dir()]
-        altered_dir = dirs[0]
-
-        # Alter first file in the altered_dir
-        files = [f for f in altered_dir.iterdir() if f.is_file()]
-        file_to_rename = files[0]
-
-        new_name = file_to_rename.with_name("new-file")
-        file_to_rename.rename(new_name)
-
-        manifest2 = serializer.serialize(sample_model_folder)
-        assert manifest1.digest != manifest2.digest
-
-    def test_folder_model_rename_dir(self, sample_model_folder):
-        file_hasher = file.FileHasher("unused", memory.SHA256())
-        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
-        manifest1 = serializer.serialize(sample_model_folder)
-
-        # Alter first directory within the model
-        dirs = [d for d in sample_model_folder.iterdir() if d.is_dir()]
-        dir_to_rename = dirs[0]
-
-        new_name = dir_to_rename.with_name("new-dir")
-        dir_to_rename.rename(new_name)
-
-        manifest2 = serializer.serialize(sample_model_folder)
-        assert manifest1.digest != manifest2.digest
-
-    def test_folder_model_replace_file_empty_folder(self, sample_model_folder):
-        file_hasher = file.FileHasher("unused", memory.SHA256())
-        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
-        manifest1 = serializer.serialize(sample_model_folder)
-
-        # Alter first directory within the model
-        dirs = [d for d in sample_model_folder.iterdir() if d.is_dir()]
-        altered_dir = dirs[0]
-
-        # Replace first file in the altered_dir
-        files = [f for f in altered_dir.iterdir() if f.is_file()]
-        file_to_replace = files[0]
-        file_to_replace.unlink()
-        file_to_replace.mkdir()
-
-        manifest2 = serializer.serialize(sample_model_folder)
-        assert manifest1.digest != manifest2.digest
-
-    def test_folder_model_change_file(self, sample_model_folder):
-        file_hasher = file.FileHasher("unused", memory.SHA256())
-        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
-        manifest1 = serializer.serialize(sample_model_folder)
-
-        # Alter first directory within the model
-        dirs = [d for d in sample_model_folder.iterdir() if d.is_dir()]
-        altered_dir = dirs[0]
-
-        # Alter first file in the altered_dir
-        files = [f for f in altered_dir.iterdir() if f.is_file()]
-        file_to_change = files[0]
-        file_to_change.write_bytes(_KNOWN_MODEL_TEXT)
-
-        manifest2 = serializer.serialize(sample_model_folder)
-        assert manifest1.digest != manifest2.digest
-
-    def test_deep_folder(self, deep_model_folder):
-        file_hasher = file.FileHasher("unused", memory.SHA256())
-        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
-        manifest = serializer.serialize(deep_model_folder)
-        expected = (
-            "36eed9389ebbbe15ac15d33c81dabb60ccb7c945ff641d78f59db9aa9dc47ac9"
-        )
-        assert manifest.digest.digest_hex == expected
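Taken together, the new serializers are used roughly as follows. This is a sketch only, with a hypothetical `path/to/model` directory; the constructor shapes mirror the ones exercised by the tests above:

```python
import pathlib

from model_signing.hashing import file
from model_signing.hashing import memory
from model_signing.serialization import dfs

model_path = pathlib.Path("path/to/model")  # hypothetical model directory

# Serial DFS traversal: one digest over the whole tree.
serializer = dfs.DFSSerializer(
    file.FileHasher("unused", memory.SHA256()), memory.SHA256
)
print(serializer.serialize(model_path).digest.digest_hex)

# Sharded variant: each worker thread builds its own hasher via the factory,
# so hashing state is never shared between shards.
def hasher_factory(path: pathlib.Path, start: int, end: int):
    return file.ShardedFileHasher(path, memory.SHA256(), start=start, end=end)

sharded = dfs.ShardedDFSSerializer(hasher_factory, memory.SHA256())
print(sharded.serialize(model_path).digest.digest_hex)
```

The two serializers deliberately produce different digests for the same model: the sharded scheme folds shard offsets into each header, so its output is not comparable with the plain DFS digest.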