Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

colocate __main__.py with the central directory record of the zipapp #2209

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 12 additions & 4 deletions pex/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from datetime import datetime
from uuid import uuid4

from pex.orderedset import OrderedSet
from pex.typing import TYPE_CHECKING, cast

if TYPE_CHECKING:
Expand Down Expand Up @@ -621,11 +622,18 @@ def zip(
# type: (...) -> None

if labels:
selected_files = set(
itertools.chain.from_iterable(self.filesets.get(label, ()) for label in labels)
selected_files = OrderedSet(
itertools.chain.from_iterable(
# If labels are provided, respect the given ordering, but still sort the files
# within each label to get deterministic output.
sorted(self.filesets.get(label, ()))
# NB: An iterable of labels with non-deterministic order is not reproducible!
for label in labels
)
)
else:
selected_files = self.files()
# Otherwise, sort the files to get reproducible output by default.
selected_files = OrderedSet(sorted(self.files()))

compression = zipfile.ZIP_DEFLATED if compress else zipfile.ZIP_STORED
with open_zip(filename, mode, compression) as zf:
Expand Down Expand Up @@ -667,7 +675,7 @@ def maybe_write_parent_dirs(path):

def iter_files():
# type: () -> Iterator[Tuple[str, str]]
for path in sorted(selected_files):
for path in selected_files:
full_path = os.path.join(self.chroot, path)
if os.path.isfile(full_path):
if exclude_file(full_path):
Expand Down
111 changes: 101 additions & 10 deletions pex/pex_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from __future__ import absolute_import

import hashlib
import itertools
import logging
import os
import shutil
Expand Down Expand Up @@ -39,7 +40,7 @@
from pex.util import CacheHelper

if TYPE_CHECKING:
from typing import Dict, Optional
from typing import ClassVar, Dict, Iterable, Iterator, Optional, Tuple


# N.B.: __file__ will be relative when this module is loaded from a "" `sys.path` entry under
Expand Down Expand Up @@ -95,14 +96,14 @@ def __maybe_run_venv__(pex, pex_root, pex_path):

venv_dir = venv_dir(
pex_file=pex,
pex_root=pex_root,
pex_root=pex_root,
pex_hash={pex_hash!r},
has_interpreter_constraints={has_interpreter_constraints!r},
pex_path=pex_path,
)
venv_pex = os.path.join(venv_dir, 'pex')
if not __execute__ or not is_exe(venv_pex):
# Code in bootstrap_pex will (re)create the venv after selecting the correct interpreter.
# Code in bootstrap_pex will (re)create the venv after selecting the correct interpreter.
return venv_dir

TRACER.log('Executing venv PEX for {{}} at {{}}'.format(pex, venv_pex))
Expand Down Expand Up @@ -702,6 +703,59 @@ def set_sh_boot_script(
)
self.set_header(script)

@classmethod
def iter_bootstrap_script_labels(cls):
# type: () -> Iterator[str]
"""``Chroot`` labels covering the scripts immediately executed by the python interpreter."""
# __pex__/__init__.py: This version of the bootstrap script will be executed by the python
# interpreter if the zipapp is on the PYTHONPATH.
yield "importhook"

# __main__.py: This version of the bootstrap script is what is first executed by the python
# interpreter for an executable entry point (which includes use as a REPL).
yield "main"

@classmethod
def iter_metadata_labels(cls):
# type: () -> Iterator[str]
"""``Chroot`` labels covering metadata files."""
# PEX-INFO
yield "manifest"

@classmethod
def iter_bootstrap_libs_labels(cls):
# type: () -> Iterator[str]
"""``Chroot`` labels covering code that may be imported from the bootstrap scripts."""
# .bootstrap/
yield "bootstrap"

@classmethod
def iter_deps_libs_labels(cls, pex_info):
# type: (PexInfo) -> Iterator[str]
"""``Chroot`` labels covering the third-party code that was resolved into dists."""
# Subdirectories of .deps/: Keys need to be sorted for deterministic output.
for dist_label in sorted(pex_info.distributions.keys()):
yield dist_label

@classmethod
def iter_direct_source_labels(cls):
# type: () -> Iterator[str]
"""User source/resource files."""
# Deprecated.
yield "resource"

# Source files from -D/--sources-directory.
yield "source"

# The value of --exe, if provided. Accessed after unpacking the zip.
yield "executable"

def _setup_pex_info(self):
# type: () -> PexInfo
pex_info = self._pex_info.copy()
pex_info.update(PexInfo.from_env())
return pex_info

def _build_packedapp(
self,
dirname, # type: str
Expand All @@ -710,15 +764,20 @@ def _build_packedapp(
):
# type: (...) -> None

pex_info = self._pex_info.copy()
pex_info.update(PexInfo.from_env())
pex_info = self._setup_pex_info()

# Include user sources, PEX-INFO and __main__ as loose files in src/.
for fileset in ("executable", "importhook", "main", "manifest", "resource", "source"):
for f in self._chroot.filesets.get(fileset, ()):
dest = os.path.join(dirname, f)
safe_mkdir(os.path.dirname(dest))
safe_copy(os.path.realpath(os.path.join(self._chroot.chroot, f)), dest)
with TRACER.timed("copying over uncached sources", V=9):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, this seems fine - but boy oh boy does it seem incredibly complicated vs just ordering the labels correctly in the existing tuple on the LHS and writing a good block comment just above.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're right, and this change could be reduced quite a bit by providing the labels literally (for example, we could avoid changing this method at all). I thought it was useful to document the meanings of each label, since they're not really documented elsewhere unless you grep for the literal strings "executable", "importhook", etc. I was also thinking that naming each section with a classmethod would make it easier for someone to subclass PEXBuilder and insert their own files into the appropriate section while retaining the performance benefit for zipapps. If that's not a supported use case, we could very well reduce the size of this change by a lot.

uncached_label_groups = [
self.iter_direct_source_labels(),
self.iter_metadata_labels(),
self.iter_bootstrap_script_labels(),
]
for fileset in itertools.chain.from_iterable(uncached_label_groups):
for f in self._chroot.filesets.get(fileset, ()):
dest = os.path.join(dirname, f)
safe_mkdir(os.path.dirname(dest))
safe_copy(os.path.realpath(os.path.join(self._chroot.chroot, f)), dest)

# Pex historically only supported compressed zips in packed layout, so we don't disturb the
# old cache structure for those zips and instead just use a subdir for un-compressed zips.
Expand Down Expand Up @@ -779,6 +838,37 @@ def zip_cache_dir(path):
os.path.join(internal_cache, location),
)

@classmethod
def iter_zipapp_labels(cls, pex_info):
# type: (PexInfo) -> Iterator[str]
"""User sources, bootstrap sources, metadata, and dependencies, optimized for tail access.

This method returns all the ``Chroot`` labels used by this ``PEXBuilder``. For zipapp
layouts, the order is significant. The python interpreter from the shebang line will process
the zipapp PEX by seeking to and parsing its central directory records at the end of the
file first. This method orders labels so the files which are first accessed by the python
interpreter will be closest to the location of the file pointer in the interpreter's file
handle for this zipapp after processing those central directory records.
"""
label_groups = [
cls.iter_deps_libs_labels(pex_info),
cls.iter_direct_source_labels(),
# As these may also be imported before unzipping the zipapp, it should be as close to
# end of the file as possible.
cls.iter_bootstrap_libs_labels(),
# Metadata files such as PEX-INFO may be accessed outside the context of an executable
# zipapp, such as with ``PexInfo.from_pex()`` and ``unzip -p pex.pex PEX-INFO``.
# While they are only accessed after unpacking to the ``"unzipped_pexes"`` cache during
# execution, these should still be near enough to the central directory record to
# minimize the seek distance to then access these files in non-execution scenarios.
cls.iter_metadata_labels(),
# The following bootstrap scripts are executed immediately after opening the zipapp, so
# they should be right near the end by the central directory records.
cls.iter_bootstrap_script_labels(),
]
for label in itertools.chain.from_iterable(label_groups):
yield label

def _build_zipapp(
self,
filename, # type: str
Expand All @@ -805,5 +895,6 @@ def _build_zipapp(
# racy.
exclude_file=is_pyc_temporary_file,
compress=compress,
labels=list(self.iter_zipapp_labels(self._setup_pex_info())),
)
chmod_plus_x(filename)
3 changes: 3 additions & 0 deletions tests/integration/test_reproducible.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,9 @@ def explode_pex(path):
), "{} and {} have different content.".format(member1, member2)
# Check that the entire file is equal, including metadata.
assert filecmp.cmp(member1, member2, shallow=False)
# Check that the file list is identical.
with ZipFile(pex1) as zfp1, ZipFile(pex2) as zfp2:
assert zfp1.namelist() == zfp2.namelist()
# Finally, check that the .pex files are byte-for-byte identical.
assert filecmp.cmp(pex1, pex2, shallow=False)

Expand Down
26 changes: 25 additions & 1 deletion tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import contextlib
import errno
import os
import uuid

import pytest

Expand All @@ -27,7 +28,7 @@
import mock # type: ignore[no-redef,import]

if TYPE_CHECKING:
from typing import Any, Iterator, Tuple
from typing import Any, Iterator, List, Tuple


def extract_perms(path):
Expand Down Expand Up @@ -140,6 +141,29 @@ def test_chroot_zip():
assert b"data" == zip.read("directory/subdirectory/file")


def test_chroot_zip_preserves_label_order():
# type: () -> None
"""Verify that the order of the labels presented to ``Chroot.zip()`` transfers to the order
of the zip file entries."""
with temporary_dir() as tmp:
chroot = Chroot(os.path.join(tmp, "chroot"))

ordered_labels = [] # type: List[str]
file_todo = [] # type: List[Tuple[str, str]]
for i in range(10):
label = uuid.uuid4().hex
file_todo.append(("{}-{}".format(uuid.uuid4().hex, i), label))
ordered_labels.append(label)
for filename, label in reversed(file_todo):
chroot.touch(filename, label=label)

zip_dst = os.path.join(tmp, "chroot.zip")
chroot.zip(zip_dst, labels=ordered_labels)
with open_zip(zip_dst) as zip:
for i, filename in enumerate(zip.namelist()):
assert filename.endswith("-{}".format(i))


def test_chroot_zip_symlink():
# type: () -> None
with temporary_dir() as tmp:
Expand Down
59 changes: 51 additions & 8 deletions tests/test_pex_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -455,22 +455,35 @@ def assert_pex_env_var_nested(**env):
assert_pex_env_var_nested(PEX_VENV=True)


@pytest.fixture(scope="function")
def tmp_pex_root_pex_builder(tmpdir):
# type: (Any) -> Iterator[PEXBuilder]
pex_root = os.path.join(str(tmpdir), "pex_root")
pb = PEXBuilder()
pb.info.pex_root = pex_root
yield pb


@pytest.fixture(scope="function")
def pex_dist_pex_builder(tmp_pex_root_pex_builder, pex_project_dir):
# type: (PEXBuilder, str) -> Iterator[PEXBuilder]
pb = tmp_pex_root_pex_builder
with ENV.patch(PEX_ROOT=pb.info.pex_root):
pb.add_dist_location(install_wheel(WheelBuilder(pex_project_dir).bdist()).location)
yield pb


@pytest.mark.parametrize(
"layout", [pytest.param(layout, id=layout.value) for layout in (Layout.PACKED, Layout.ZIPAPP)]
)
def test_build_compression(
tmpdir, # type: Any
layout, # type: Layout.Value
pex_project_dir, # type: str
pex_dist_pex_builder, # type: PEXBuilder
):
# type: (...) -> None
pb = pex_dist_pex_builder

pex_root = os.path.join(str(tmpdir), "pex_root")

pb = PEXBuilder()
pb.info.pex_root = pex_root
with ENV.patch(PEX_ROOT=pex_root):
pb.add_dist_location(install_wheel(WheelBuilder(pex_project_dir).bdist()).location)
exe = os.path.join(str(tmpdir), "exe.py")
with open(exe, "w") as fp:
fp.write("import pex; print(pex.__file__)")
Expand All @@ -479,7 +492,9 @@ def test_build_compression(
def assert_pex(pex):
# type: (str) -> None
assert (
subprocess.check_output(args=[sys.executable, pex]).decode("utf-8").startswith(pex_root)
subprocess.check_output(args=[sys.executable, pex])
.decode("utf-8")
.startswith(pb.info.pex_root)
)

compressed_pex = os.path.join(str(tmpdir), "compressed.pex")
Expand All @@ -499,3 +514,31 @@ def size(pex):
)

assert size(compressed_pex) < size(uncompressed_pex)


def test_bootstrap_at_end_of_zipapp(tmpdir, pex_dist_pex_builder):
# type: (Any, PEXBuilder) -> None
pb = pex_dist_pex_builder

src_file = os.path.join(str(tmpdir), "src.py")
with open(src_file, mode="w") as fp:
fp.write("print('hello')")
pb.add_source(src_file, "src.py")

exe = os.path.join(str(tmpdir), "exe.py")
with open(exe, "w") as fp:
fp.write("import pex; print(pex.__file__)")
pb.set_executable(exe)

compressed_pex = os.path.join(str(tmpdir), "test.pex")
pb.build(compressed_pex, layout=Layout.ZIPAPP)

with open_zip(compressed_pex, mode="r") as zf:
names = list(reversed(zf.namelist()))
assert names[0:4] == [
"__main__.py",
"__pex__/__init__.py",
"__pex__/",
"PEX-INFO",
]
assert names[4].startswith(".bootstrap/") and names[4].endswith(".py")