From a75424e162a7fc4ee12a60b3819c314266ccf97e Mon Sep 17 00:00:00 2001 From: Jay Qi <2721979+jayqi@users.noreply.github.com> Date: Mon, 8 Jan 2024 12:21:30 -0500 Subject: [PATCH] Overwrite permissions with fixed values (#6) * Overwrite permissions * Use copy to not mutate passed in ZipInfo * Update documentation * Update comment * Add tests for environment variables * Fix lint * Skip mode check for regular ZipFile on Windows * Change date * Note about 0o prefix * Add example value --------- Co-authored-by: Jay Qi --- CHANGELOG.md | 5 +++ README.md | 27 ++++++++++-- repro_zipfile.py | 75 +++++++++++++++++++++++++++++---- tests/test_core.py | 101 +++++++++++++++++++++++++++++++++++++++++++-- tests/utils.py | 10 +++++ 5 files changed, 201 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 82ddf52..3abb130 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog +## v0.2.0 (2024-01-08) + +- Changed `ReproducibleZipFile` to also overwrite file-system permissions with fixed values. These default to `0o644` (`rw-r--r--`) for files and `0o755` (`rwxr-xr-x`) for directories. +- Added support for `REPRO_ZIPFILE_FILE_MODE` and `REPRO_ZIPFILE_DIR_MODE` environment variables for overriding the fixed file and directory permission values. + ## v0.1.0 (2023-08-12) Initial release! 🎉 diff --git a/README.md b/README.md index 2bfc216..7c57bdf 100644 --- a/README.md +++ b/README.md @@ -45,13 +45,16 @@ Note that files must be written to the archive in the same order to reproduce an See [`examples/usage.py`](./examples/usage.py) for an example script that you can run, and [`examples/demo_vs_zipfile.py`](./examples/demo_vs_zipfile.py) for a demonstration in contrast with the standard library's zipfile module. -### Set timestamp value with SOURCE_DATE_EPOCH - -repro_zipfile supports the `SOURCE_DATE_EPOCH` environment variable. If set, it will be used as a fixed value for the modified timestamps of files added to an archive. 
This should be an integer corresponding to the [Unix epoch time](https://en.wikipedia.org/wiki/Unix_time) of the timestamp you want to set. `SOURCE_DATE_EPOCH` is a [standard](https://reproducible-builds.org/docs/source-date-epoch/) created by the [Reproducible Builds project](https://reproducible-builds.org/) for software distributions. +For more advanced usage, such as customizing the fixed metadata values, see the following section. ## How does repro-zipfile work? -The primary reason that ZIP archives aren't automatically reproducible is because they include last-modified timestamps of files. This means that files with identical content but with different last-modified times cause the resulting ZIP archive to be different. `repro_zipfile.ReproducibleZipFile` is a subclass of `zipfile.ZipFile` that overrides the `write` and `writestr` methods to set the modified timestamp of all files written to the archive to a fixed value. By default, this value is 1980-01-01 0:00 UTC, which is the earliest timestamp that is supported by the ZIP format. You can customize this value as documented in the previous section. Note that repro-zipfile does not modify the original files—only the metadata written to the archive. +ZIP archives are not normally reproducible even when containing files with identical content because of file metadata. In particular, the usual culprits are: + +1. Last-modified timestamps +2. File-system permissions (mode) + +`repro_zipfile.ReproducibleZipFile` is a subclass of `zipfile.ZipFile` that overrides the `write`, `writestr`, and `mkdir` methods with versions that set the above metadata to fixed values. Note that repro-zipfile does not modify the original files—only the metadata written to the archive. 
You can effectively reproduce what `ReproducibleZipFile` does with something like this: @@ -63,14 +66,30 @@ with ZipFile("archive.zip", "w") as zp: zp.write("examples/data.txt", arcname="data.txt") zinfo = zp.getinfo("data.txt") zinfo.date_time = (1980, 1, 1, 0, 0, 0) + zinfo.external_attr = 0o644 << 16 # Or writestr to write data to the archive zp.writestr("lore.txt", data="goodbye") zinfo = zp.getinfo("lore.txt") zinfo.date_time = (1980, 1, 1, 0, 0, 0) + zinfo.external_attr = 0o644 << 16 ``` It's not hard to do, but we believe `ReproducibleZipFile` is sufficiently more convenient to justify a small package! +See the next two sections for more details about the replacement metadata values and how to customize them. + +### Last-modified timestamps + +ZIP archives store the last-modified timestamps of files and directories. `ReproducibleZipFile` will set this to a fixed value. By default, the fixed value is 1980-01-01 00:00 UTC, which is the earliest timestamp that is supported by the ZIP format specifications. + +You can customize this value with the `SOURCE_DATE_EPOCH` environment variable. If set, it will be used as the fixed value instead. This should be an integer corresponding to the [Unix epoch time](https://en.wikipedia.org/wiki/Unix_time) of the timestamp you want to set, e.g., `1704067200` for 2024-01-01 00:00:00 UTC. `SOURCE_DATE_EPOCH` is a [standard](https://reproducible-builds.org/docs/source-date-epoch/) created by the [Reproducible Builds project](https://reproducible-builds.org/) for software distributions. + +### File-system permissions + +ZIP archives store the file-system permissions of files and directories. The default permissions set for new files or directories often can be different across different systems or users without any intentional choices being made. (These default permissions are controlled by something called [`umask`](https://en.wikipedia.org/wiki/Umask).) `ReproducibleZipFile` will set these to fixed values. 
By default, the fixed values are `0o644` (`rw-r--r--`) for files and `0o755` (`rwxr-xr-x`) for directories, which matches the common default `umask` of `0o022` for root users on Unix systems. (The [`0o` prefix](https://docs.python.org/3/reference/lexical_analysis.html#integers) is how you can write an octal—i.e., base 8—integer literal in Python.) + +You can customize these values using the environment variables `REPRO_ZIPFILE_FILE_MODE` and `REPRO_ZIPFILE_DIR_MODE`. They should be in three-digit octal [Unix numeric notation](https://en.wikipedia.org/wiki/File-system_permissions#Numeric_notation), e.g., `644` for `rw-r--r--`. + ## Why care about reproducible ZIP archives? ZIP archives are often useful when dealing with a set of multiple files, especially if the files are large and can be compressed. Creating reproducible ZIP archives is often useful for: diff --git a/repro_zipfile.py b/repro_zipfile.py index aada1df..c2e1bfc 100644 --- a/repro_zipfile.py +++ b/repro_zipfile.py @@ -11,23 +11,52 @@ except ImportError: _MASK_COMPRESS_OPTION_1 = 0x02 -__version__ = "0.1.0" +__version__ = "0.2.0" def date_time() -> Union[time.struct_time, Tuple[int, int, int, int, int, int]]: + """Returns date_time value used to force overwrite on all ZipInfo objects. Defaults to + 1980-01-01 00:00:00. You can set this with the environment variable SOURCE_DATE_EPOCH as an + integer value representing seconds since Epoch. + """ source_date_epoch = os.environ.get("SOURCE_DATE_EPOCH", None) if source_date_epoch is not None: return time.localtime(int(source_date_epoch)) return (1980, 1, 1, 0, 0, 0) +def file_mode() -> int: + """Returns the file permissions mode value used to force overwrite on all ZipInfo objects. + Defaults to 0o644 (rw-r--r--). You can set this with the environment variable + REPRO_ZIPFILE_FILE_MODE. It should be in the Unix standard three-digit octal representation + (e.g., '644'). 
+ """ + file_mode_env = os.environ.get("REPRO_ZIPFILE_FILE_MODE", None) + if file_mode_env is not None: + return int(file_mode_env, 8) + return 0o644 + + +def dir_mode() -> int: + """Returns the directory permissions mode value used to force overwrite on all ZipInfo objects. + Defaults to 0o755 (rwxr-xr-x). You can set this with the environment variable + REPRO_ZIPFILE_DIR_MODE. It should be in the Unix standard three-digit octal representation + (e.g., '755'). + """ + dir_mode_env = os.environ.get("REPRO_ZIPFILE_DIR_MODE", None) + if dir_mode_env is not None: + return int(dir_mode_env, 8) + return 0o755 + + class ReproducibleZipFile(ZipFile): """Open a ZIP file, where file can be a path to a file (a string), a file-like object or a path-like object. - This is a replacement for the Python standard library zipfile.ZipFile that - overwrites file-modified timestamps in write mode in order to create a reproducible ZIP - archive. For documentation on use, see the Python documentation for zipfile: + This is a replacement for the Python standard library zipfile.ZipFile that overwrites + file-modified timestamps and file/directory permissions modes in write mode in order to create + a reproducible ZIP archive. Other than overwriting these values, it works the same way as + zipfile.ZipFile. 
For documentation on use, see the Python documentation for zipfile: https://docs.python.org/3/library/zipfile.html """ @@ -44,7 +73,17 @@ def write(self, filename, arcname=None, compress_type=None, compresslevel=None): raise ValueError("Can't write to ZIP archive while an open writing handle exists") zinfo = ZipInfo.from_file(filename, arcname, strict_timestamps=self._strict_timestamps) - zinfo.date_time = date_time() # ADDED + + ## repro-zipfile ADDED ## + # Overwrite date_time and external_attr (permissions mode) + zinfo = copy(zinfo) + zinfo.date_time = date_time() + if zinfo.is_dir(): + zinfo.external_attr = (0o40000 | dir_mode()) << 16 + zinfo.external_attr |= 0x10 # MS-DOS directory flag + else: + zinfo.external_attr = file_mode() << 16 + ######################### if zinfo.is_dir(): zinfo.compress_size = 0 @@ -75,7 +114,7 @@ def writestr(self, zinfo_or_arcname, data, compress_type=None, compresslevel=Non if isinstance(data, str): data = data.encode("utf-8") if not isinstance(zinfo_or_arcname, ZipInfo): - zinfo = ZipInfo(filename=zinfo_or_arcname, date_time=date_time()) # CHANGED + zinfo = ZipInfo(filename=zinfo_or_arcname, date_time=time.localtime(time.time())[:6]) zinfo.compress_type = self.compression zinfo._compresslevel = self.compresslevel if zinfo.filename.endswith("/"): @@ -84,8 +123,18 @@ def writestr(self, zinfo_or_arcname, data, compress_type=None, compresslevel=Non else: zinfo.external_attr = 0o600 << 16 # ?rw------- else: - zinfo = copy(zinfo_or_arcname) # CHANGED - zinfo.date_time = date_time() # ADDED + zinfo = zinfo_or_arcname + + ## repro-zipfile ADDED ## + # Overwrite date_time and external_attr (permissions mode) + zinfo = copy(zinfo) + zinfo.date_time = date_time() + if zinfo.is_dir(): + zinfo.external_attr = (0o40000 | dir_mode()) << 16 + zinfo.external_attr |= 0x10 # MS-DOS directory flag + else: + zinfo.external_attr = file_mode() << 16 + ######################### if not self.fp: raise ValueError("Attempt to write to ZIP archive that was 
already closed") @@ -104,7 +153,7 @@ def writestr(self, zinfo_or_arcname, data, compress_type=None, compresslevel=Non dest.write(data) if sys.version_info < (3, 11): - # Following method copied from Python 3.11 + # Following method modified from Python 3.11 # https://github.com/python/cpython/blob/202efe1a3bcd499f3bf17bd953c6d36d47747e78/Lib/zipfile.py#L1837-L1870 # Copyright Python Software Foundation, licensed under PSF License Version 2 # See LICENSE file for full license agreement and notice of copyright @@ -127,6 +176,14 @@ def mkdir(self, zinfo_or_directory_name, mode=511): else: raise TypeError("Expected type str or ZipInfo") + ## repro-zipfile ADDED ## + # Overwrite date_time and external_attr (permissions mode) + zinfo = copy(zinfo) + zinfo.date_time = date_time() + zinfo.external_attr = (0o40000 | dir_mode()) << 16 + zinfo.external_attr |= 0x10 # MS-DOS directory flag + ######################### + with self._lock: if self._seekable: self.fp.seek(self.start_dir) diff --git a/tests/test_core.py b/tests/test_core.py index 2e0c53e..44f8af3 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,3 +1,4 @@ +import platform from time import sleep from zipfile import ZipFile, ZipInfo @@ -8,11 +9,12 @@ dir_tree_factory, file_factory, hash_file, + umask, ) -def test_write_dir_tree(base_path): - """Archiving a directory tree works.""" +def test_write_dir_tree_mtime(base_path): + """Archiving a directory tree works with different modified time.""" dir_tree = dir_tree_factory(base_path) # Create base ReproducibleZipFile archive @@ -27,7 +29,7 @@ def test_write_dir_tree(base_path): for path in sorted(dir_tree.glob("**/*")): zp.write(path) - # Update modified times + # Sleep to update modified times, change permissions sleep(2) for path in dir_tree.glob("**/*"): path.touch() @@ -54,6 +56,56 @@ def test_write_dir_tree(base_path): assert hash_file(zipfile_arc1) != hash_file(zipfile_arc2) +def test_write_dir_tree_mode(base_path): + """Archiving a directory tree works 
with different permission modes.""" + with umask(0o022): + dir_tree = dir_tree_factory(base_path) + + # Create base ReproducibleZipFile archive + repro_zipfile_arc1 = base_path / "repro_zipfile1.zip" + with ReproducibleZipFile(repro_zipfile_arc1, "w") as zp: + for path in sorted(dir_tree.glob("**/*")): + zp.write(path) + + # Create regular ZipFile archive for comparison + zipfile_arc1 = base_path / "zipfile1.zip" + with ZipFile(zipfile_arc1, "w") as zp: + for path in sorted(dir_tree.glob("**/*")): + zp.write(path) + + # Change permissions + with umask(0o002) as mask: + dir_tree.chmod(mode=0o777 ^ mask) + for path in dir_tree.glob("**/*"): + if path.is_file(): + path.chmod(mode=0o666 ^ mask) + else: + path.chmod(mode=0o777 ^ mask) + + # Create second ReproducibleZipFile archive after delay + repro_zipfile_arc2 = base_path / "repro_zipfile2.zip" + with ReproducibleZipFile(repro_zipfile_arc2, "w") as zp: + for path in sorted(dir_tree.glob("**/*")): + zp.write(path) + + # Create second regular ZipFile archive for comparison after delay + zipfile_arc2 = base_path / "zipfile2.zip" + with ZipFile(zipfile_arc2, "w") as zp: + for path in sorted(dir_tree.glob("**/*")): + zp.write(path) + + # All four archives should have identical content + assert_archive_contents_equals(repro_zipfile_arc1, zipfile_arc1) + assert_archive_contents_equals(repro_zipfile_arc1, repro_zipfile_arc2) + assert_archive_contents_equals(repro_zipfile_arc1, zipfile_arc2) + + # ReproducibleZipFile hashes should match; ZipFile hashes should not + assert hash_file(repro_zipfile_arc1) == hash_file(repro_zipfile_arc2) + if platform.system() != "Windows": + # Windows doesn't seem to actually make them different + assert hash_file(zipfile_arc1) != hash_file(zipfile_arc2) + + def test_write_dir_tree_string_paths(rel_path): """Archiving a directory tree works.""" dir_tree = dir_tree_factory(rel_path) @@ -132,7 +184,7 @@ def test_write_single_file(base_path): assert hash_file(zip1) != hash_file(zip2) -def 
test_write_single_file_with_source_date_epoch(base_path, monkeypatch): +def test_write_single_file_source_date_epoch(base_path, monkeypatch): """Writing the same file with different mtime with SOURCE_DATE_EPOCH set produces the same hash.""" @@ -165,6 +217,26 @@ def test_write_single_file_with_source_date_epoch(base_path, monkeypatch): assert hash_file(arc_sde1) == hash_file(arc_sde2) +def test_write_single_file_file_mode_env_var(rel_path, monkeypatch): + """REPRO_ZIPFILE_FILE_MODE environment variable works.""" + + with umask(0o002): + # Expect 664 + data_file = file_factory(rel_path) + + monkeypatch.setenv("REPRO_ZIPFILE_FILE_MODE", "600") # rw------- + + arc_path = rel_path / "archive.zip" + with ReproducibleZipFile(arc_path, "w") as zp: + zp.write(data_file) + + with ZipFile(arc_path, "r") as zp: + print(zp.infolist()) + mode = (zp.getinfo(data_file.name).external_attr >> 16) & 0o777 + + assert mode == 0o600, (oct(mode), oct(0o600)) + + def test_write_single_file_string_paths(rel_path): """Writing the same file with different mtime produces the same hash, using string inputs instead of Path.""" @@ -234,6 +306,27 @@ def test_write_single_file_arcname(base_path): assert hash_file(zip1) != hash_file(zip2) +def test_write_single_dir_dir_mode_env_var(rel_path, monkeypatch): + """REPRO_ZIPFILE_DIR_MODE environment variable works.""" + + with umask(0o002): + # Expect 775 + dir_path = rel_path / data_factory() + dir_path.mkdir() + + monkeypatch.setenv("REPRO_ZIPFILE_DIR_MODE", "700") # rwx------ + + arc_path = rel_path / "archive.zip" + with ReproducibleZipFile(arc_path, "w") as zp: + zp.write(dir_path) + + with ZipFile(arc_path, "r") as zp: + print(zp.infolist()) + mode = (zp.getinfo(dir_path.name + "/").external_attr >> 16) & 0o777 + + assert mode == 0o700, (oct(mode), oct(0o700)) + + def test_writestr(tmp_path): """writestr works as expected""" data = data_factory() diff --git a/tests/utils.py b/tests/utils.py index 067c589..5fbaf1e 100644 --- a/tests/utils.py +++ 
b/tests/utils.py @@ -1,4 +1,6 @@ +from contextlib import contextmanager import hashlib +import os from pathlib import Path from tempfile import TemporaryDirectory from zipfile import ZipFile @@ -43,6 +45,14 @@ def dir_tree_factory(parent_dir: Path): return root_dir +@contextmanager +def umask(mask: int): + """Utility context manager to temporarily set umask to a new value.""" + old_mask = os.umask(mask) + yield mask + os.umask(old_mask) + + def hash_file(path: Path): """Utility function to calculate the hash of a file's contents.""" return hashlib.md5(path.read_bytes()).hexdigest()