From 0e9aee8da0295720b6ee60174692502c540ce257 Mon Sep 17 00:00:00 2001 From: Daniel Holth Date: Mon, 3 Oct 2022 09:31:23 -0400 Subject: [PATCH] lazier lazy_wheel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - handle short files `416`; prefetch entire dist-info - lazy_wheel: translate BadZipfile to InvalidWheel - handle 501 on negative bytes range from pypi - catch UnsupportedWheel - make lazy wheel work against tensorflow-gpu - link to pypi discussion on negative byte ranges - check for case when server doesn't support byte ranges at all - remove _check_zip() method since we do that in prefetch_dist_info() now - clean up error handling code and don't warn when negative ranges fail - remove TODO, as Cache-Control: no-cache is the correct behavior - rearrange explanatory comments - specify how we handle 200 OK and interpret 405 as no range requests - rename LazyZipOverHTTP to LazyWheelOverHTTP because of the assumption of the structure of *.dist-info/ - stream the initial chunk request too, since it's 1MB - add note about fast-deps being on by default - add testing for request limiting - fix download metadata testing - factor out the laziness from the wheel-specific logic - factor out LazyRemoteResource from FixedSizeLazyResource - add metadata_first=True arg to make_wheel - reduce initial chunk size to 10K - remove hardcoded compilewheel and instead generate wheels - catch new requests decoding error - support NegativeRangeOverflowing - support SneakilyCoerceNegativeRange - time out test server thread joining within 3 seconds Co-authored-by: Tzu-ping Chung - make FakePackageSource to abstract over generated and hardcoded whls - ensure InvalidWheel retains context from inner errors Co-authored-by: Randy Döring <30527984+radoering@users.noreply.github.com> - add link to perf evaluation from radoering --- news/12208.feature.rst | 2 + src/pip/_internal/exceptions.py | 6 +- src/pip/_internal/network/lazy_wheel.py | 734 ++++++++++++++++++++---- src/pip/_internal/operations/prepare.py | 19 +- src/pip/_internal/utils/wheel.py | 4 + tests/conftest.py | 405 ++++++++++++- tests/functional/test_download.py | 4 +- tests/functional/test_fast_deps.py | 215 ++++++- tests/lib/__init__.py | 2 + tests/lib/wheel.py | 50 +- 10 files changed, 1298 insertions(+), 143 deletions(-) create mode 100644 news/12208.feature.rst diff --git a/news/12208.feature.rst b/news/12208.feature.rst new file mode 100644 index 00000000000..a4c298505a5 --- /dev/null +++ b/news/12208.feature.rst @@ -0,0 +1,2 @@ +Implement lazier lazy_wheel that avoids the HEAD request, fetching the metadata +for many wheels in 1 request. diff --git a/src/pip/_internal/exceptions.py b/src/pip/_internal/exceptions.py index 2587740f73a..fd62175ddae 100644 --- a/src/pip/_internal/exceptions.py +++ b/src/pip/_internal/exceptions.py @@ -323,12 +323,14 @@ class UnsupportedWheel(InstallationError): class InvalidWheel(InstallationError): """Invalid (e.g. corrupt) wheel.""" - def __init__(self, location: str, name: str): + def __init__(self, location: str, name: str, context: Optional[str] = None): self.location = location self.name = name + self.context = context def __str__(self) -> str: - return f"Wheel '{self.name}' located at {self.location} is invalid." + suffix = f" ({self.context})" if self.context else "." 
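+        # ``suffix`` supplies the closing punctuation: " (<context>)" when inner
+        # error context is available, otherwise a bare period.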
+ return f"Wheel '{self.name}' located at {self.location} is invalid{suffix}" class MetadataInconsistent(InstallationError): diff --git a/src/pip/_internal/network/lazy_wheel.py b/src/pip/_internal/network/lazy_wheel.py index 82ec50d5106..834316ea011 100644 --- a/src/pip/_internal/network/lazy_wheel.py +++ b/src/pip/_internal/network/lazy_wheel.py @@ -1,23 +1,36 @@ """Lazy ZIP over HTTP""" -__all__ = ["HTTPRangeRequestUnsupported", "dist_from_wheel_url"] +from __future__ import annotations +import abc +import io +import logging +import re from bisect import bisect_left, bisect_right from contextlib import contextmanager from tempfile import NamedTemporaryFile -from typing import Any, Dict, Generator, List, Optional, Tuple +from typing import Any, BinaryIO, ClassVar, Iterable, Iterator, cast +from urllib.parse import urlparse from zipfile import BadZipFile, ZipFile from pip._vendor.packaging.utils import canonicalize_name -from pip._vendor.requests.models import CONTENT_CHUNK_SIZE, Response +from pip._vendor.requests.exceptions import ContentDecodingError +from pip._vendor.requests.models import CONTENT_CHUNK_SIZE, HTTPError, Response +from pip._vendor.requests.status_codes import codes +from pip._internal.exceptions import InvalidWheel, UnsupportedWheel from pip._internal.metadata import BaseDistribution, MemoryWheel, get_wheel_distribution from pip._internal.network.session import PipSession -from pip._internal.network.utils import HEADERS, raise_for_status, response_chunks +from pip._internal.network.utils import HEADERS +from pip._internal.utils.logging import indent_log + +__all__ = ["HTTPRangeRequestUnsupported", "dist_from_wheel_url", "LazyWheelOverHTTP"] + +logger = logging.getLogger(__name__) class HTTPRangeRequestUnsupported(Exception): - pass + """Raised when the remote server appears unable to support byte ranges.""" def dist_from_wheel_url(name: str, url: str, session: PipSession) -> BaseDistribution: @@ -25,42 +38,86 @@ def dist_from_wheel_url(name: str, url: str, session: PipSession) -> BaseDistrib This uses HTTP range requests to only fetch the portion of the wheel containing metadata, just enough for the object to be constructed. - If such requests are not supported, HTTPRangeRequestUnsupported - is raised. + + :raises HTTPRangeRequestUnsupported: if range requests are unsupported for ``url``. + :raises InvalidWheel: if the zip file contents could not be parsed. """ - with LazyZipOverHTTP(url, session) as zf: - # For read-only ZIP files, ZipFile only needs methods read, - # seek, seekable and tell, not the whole IO protocol. - wheel = MemoryWheel(zf.name, zf) # type: ignore - # After context manager exit, wheel.name - # is an invalid file by intention. - return get_wheel_distribution(wheel, canonicalize_name(name)) + try: + # After context manager exit, wheel.name will point to a deleted file path. + # Add `delete_backing_file=False` to disable this for debugging. + with LazyWheelOverHTTP(url, session) as lazy_file: + lazy_file.prefetch_contiguous_dist_info(name) + wheel = MemoryWheel(lazy_file.name, lazy_file) + return get_wheel_distribution(wheel, canonicalize_name(name)) + except (BadZipFile, UnsupportedWheel, ContentDecodingError) as e: + # We assume that these errors have occurred because the wheel contents themself + # are invalid, not because we've messed up our bookkeeping and produced an + # invalid file that pip would otherwise process normally. 
+ raise InvalidWheel(url, name, context=str(e)) -class LazyZipOverHTTP: - """File-like object mapped to a ZIP file over HTTP. - This uses HTTP range requests to lazily fetch the file's content, - which is supposed to be fed to ZipFile. If such requests are not - supported by the server, raise HTTPRangeRequestUnsupported - during initialization. +class MergeIntervals: + """Stateful bookkeeping to merge interval graphs.""" + + def __init__(self, *, left: Iterable[int] = (), right: Iterable[int] = ()) -> None: + self._left = list(left) + self._right = list(right) + + def __repr__(self) -> str: + return f"{type(self).__name__}(left={tuple(self._left)}, right={tuple(self._right)})" # noqa: E501 + + def _merge( + self, start: int, end: int, left: int, right: int + ) -> Iterator[tuple[int, int]]: + """Return an iterator of intervals to be fetched. + + Args: + start (int): Start of needed interval + end (int): End of needed interval + left (int): Index of first overlapping downloaded data + right (int): Index after last overlapping downloaded data + """ + lslice, rslice = self._left[left:right], self._right[left:right] + i = start = min([start] + lslice[:1]) + end = max([end] + rslice[-1:]) + for j, k in zip(lslice, rslice): + if j > i: + yield i, j - 1 + i = k + 1 + if i <= end: + yield i, end + self._left[left:right], self._right[left:right] = [start], [end] + + def minimal_intervals_covering( + self, start: int, end: int + ) -> Iterator[tuple[int, int]]: + """Provide the intervals needed to cover from ``start <= x <= end``. + + This method mutates internal state so that later calls only return intervals not + covered by prior calls. The first call to this method will always return exactly + one interval, which was exactly the one requested. Later requests for + intervals overlapping that first requested interval will yield only the ranges + not previously covered (which may be empty, e.g. if the same interval is + requested twice). + + This may be used e.g. to download substrings of remote files on demand. + """ + left = bisect_left(self._right, start) + right = bisect_right(self._left, end) + for start, end in self._merge(start, end, left, right): + yield (start, end) + + +class ReadOnlyIOWrapper(BinaryIO): + """Implement read-side ``BinaryIO`` methods wrapping an inner ``BinaryIO``. + + This wrapper is useful because Python currently does not distinguish read-only + streams at the type level. 
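+
+    For example (an illustrative doctest, wrapping an in-memory ``BytesIO`` rather
+    than a real remote resource):
+
+    >>> import io
+    >>> f = ReadOnlyIOWrapper(io.BytesIO(b"asdf"))
+    >>> f.read(2)
+    b'as'
+    >>> f.tell()
+    2
+    >>> f.read()
+    b'df'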
""" - def __init__( - self, url: str, session: PipSession, chunk_size: int = CONTENT_CHUNK_SIZE - ) -> None: - head = session.head(url, headers=HEADERS) - raise_for_status(head) - assert head.status_code == 200 - self._session, self._url, self._chunk_size = session, url, chunk_size - self._length = int(head.headers["Content-Length"]) - self._file = NamedTemporaryFile() - self.truncate(self._length) - self._left: List[int] = [] - self._right: List[int] = [] - if "bytes" not in head.headers.get("Accept-Ranges", "none"): - raise HTTPRangeRequestUnsupported("range request is not supported") - self._check_zip() + def __init__(self, inner: BinaryIO) -> None: + self._file = inner @property def mode(self) -> str: @@ -85,6 +142,19 @@ def closed(self) -> bool: """Whether the file is closed.""" return self._file.closed + def fileno(self) -> int: + return self._file.fileno() + + def flush(self) -> None: + self._file.flush() + + def isatty(self) -> bool: + return False + + def readable(self) -> bool: + """Return whether the file is readable, which is True.""" + return True + def read(self, size: int = -1) -> bytes: """Read up to size bytes from the object and return them. @@ -92,16 +162,14 @@ def read(self, size: int = -1) -> bytes: all bytes until EOF are returned. Fewer than size bytes may be returned if EOF is reached. """ - download_size = max(size, self._chunk_size) - start, length = self.tell(), self._length - stop = length if size < 0 else min(start + download_size, length) - start = max(0, stop - download_size) - self._download(start, stop - 1) return self._file.read(size) - def readable(self) -> bool: - """Return whether the file is readable, which is True.""" - return True + def readline(self, limit: int = -1) -> bytes: + # Explicit impl needed to satisfy mypy. + raise NotImplementedError + + def readlines(self, hint: int = -1) -> list[bytes]: + raise NotImplementedError def seek(self, offset: int, whence: int = 0) -> int: """Change stream position and return the new absolute position. @@ -117,7 +185,7 @@ def tell(self) -> int: """Return the current position.""" return self._file.tell() - def truncate(self, size: Optional[int] = None) -> int: + def truncate(self, size: int | None = None) -> int: """Resize the stream to the given size in bytes. If size is unspecified resize to the current position. @@ -131,15 +199,81 @@ def writable(self) -> bool: """Return False.""" return False - def __enter__(self) -> "LazyZipOverHTTP": + def write(self, s: bytes) -> int: # type: ignore[override] + raise NotImplementedError + + def writelines(self, lines: Iterable[bytes]) -> None: # type: ignore[override] + raise NotImplementedError + + def __enter__(self) -> ReadOnlyIOWrapper: self._file.__enter__() return self def __exit__(self, *exc: Any) -> None: self._file.__exit__(*exc) + def __iter__(self) -> Iterator[bytes]: + raise NotImplementedError + + def __next__(self) -> bytes: + raise NotImplementedError + + +class LazyRemoteResource(ReadOnlyIOWrapper): + """Abstract class for a binary file object that lazily downloads its contents. + + This class uses a ``MergeIntervals`` instance to keep track of what it's downloaded. + """ + + def __init__(self, inner: BinaryIO) -> None: + super().__init__(inner) + self._merge_intervals: MergeIntervals | None = None + + @abc.abstractmethod + def fetch_content_range(self, start: int, end: int) -> Iterator[bytes]: + """Call to the remote backend to provide this byte range in one or more chunks. 
+
+        NB: For compatibility with HTTP range requests, the range provided to this
+        method must *include* the byte indexed at argument ``end`` (so e.g. ``0-1``
+        is 2 bytes long, and the range can never be empty).
+
+        :raises Exception: implementations may raise an exception for e.g.
+            intermittent errors accessing the remote resource.
+        """
+        ...
+
+    def _setup_content(self) -> None:
+        """Ensure ``self._merge_intervals`` is initialized.
+
+        Called in ``__enter__``, and should make recursive invocations into a no-op.
+        Subclasses may override this method."""
+        if self._merge_intervals is None:
+            self._merge_intervals = MergeIntervals()
+
+    def _reset_content(self) -> None:
+        """Unset ``self._merge_intervals``.
+
+        Called in ``__exit__``, and should make recursive invocations into a no-op.
+        Subclasses may override this method."""
+        if self._merge_intervals is not None:
+            logger.debug(
+                "unsetting merge intervals (were: %s)", repr(self._merge_intervals)
+            )
+            self._merge_intervals = None
+
+    def __enter__(self) -> LazyRemoteResource:
+        """Call ``self._setup_content()``, then return self."""
+        super().__enter__()
+        self._setup_content()
+        return self
+
+    def __exit__(self, *exc: Any) -> None:
+        """Call ``self._reset_content()``."""
+        self._reset_content()
+        super().__exit__(*exc)
+
 
     @contextmanager
-    def _stay(self) -> Generator[None, None, None]:
+    def _stay(self) -> Iterator[None]:
         """Return a context manager keeping the position.
 
         At the end of the block, seek back to original position.
@@ -150,61 +284,465 @@ def _stay(self) -> Generator[None, None, None]:
         finally:
             self.seek(pos)
 
-    def _check_zip(self) -> None:
-        """Check and download until the file is a valid ZIP."""
-        end = self._length - 1
-        for start in reversed(range(0, end, self._chunk_size)):
-            self._download(start, end)
-            with self._stay():
-                try:
-                    # For read-only ZIP files, ZipFile only needs
-                    # methods read, seek, seekable and tell.
-                    ZipFile(self)  # type: ignore
-                except BadZipFile:
-                    pass
-                else:
-                    break
+    def ensure_downloaded(self, start: int, end: int) -> None:
+        """Ensure bytes in the half-open range ``[start, end)`` have been downloaded
+        and written to the backing file.
+
+        :raises ValueError: if ``__enter__`` was not called beforehand.
+        """
+        if self._merge_intervals is None:
+            raise ValueError(".__enter__() must be called to set up merge intervals")
+        # Convert the exclusive ``end`` into the inclusive convention used by
+        # ``MergeIntervals``.
+        end -= 1
+        with self._stay():
+            for start, end in self._merge_intervals.minimal_intervals_covering(
+                start, end
+            ):
+                self.seek(start)
+                for chunk in self.fetch_content_range(start, end):
+                    self._file.write(chunk)
+
 
-    def _stream_response(
-        self, start: int, end: int, base_headers: Dict[str, str] = HEADERS
-    ) -> Response:
-        """Return HTTP response to a range request from start to end."""
-        headers = base_headers.copy()
+class FixedSizeLazyResource(LazyRemoteResource):
+    """Fetch a fixed content length from the remote server on ``__enter__`` in order
+    to support ``.read(-1)`` and seek calls against ``io.SEEK_END``."""
+
+    def __init__(self, inner: BinaryIO) -> None:
+        super().__init__(inner)
+        self._length: int | None = None
+
+    @abc.abstractmethod
+    def _fetch_content_length(self) -> int:
+        """Query the remote resource for the total length of the file.
+
+        This method may also mutate internal state, such as writing to the backing
+        file. It's marked private because it will only be called within this class's
+        ``__enter__`` implementation to populate an internal length field.
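+
+        Known implementations in this module: ``LazyHTTPFile`` issues a HEAD
+        request, while ``LazyWheelOverHTTP`` overrides this to also pull down a
+        chunk from the end of the file with a negative-offset range request.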
+ + :raises Exception: implementations may raise any type of exception if the length + value could not be parsed, or any other issue which might + cause valid calls to ``self.fetch_content_range()`` to fail. + """ + ... + + def _setup_content(self) -> None: + """Initialize the internal length field and other bookkeeping. + + After parsing the remote file length with ``self._fetch_content_length()``, + this method will truncate the underlying file from parent abstract class + ``ReadOnlyIOWrapper`` to that size in order to support seek operations against + ``io.SEEK_END`` in ``self.read()``. + + This method is called in ``__enter__``. + """ + super()._setup_content() + + if self._length is None: + logger.debug("begin fetching content length") + with indent_log(): + self._length = self._fetch_content_length() + logger.debug("done fetching content length (is: %d)", self._length) + else: + logger.debug("content length already fetched (is: %d)", self._length) + + # Enable us to seek and write anywhere in the backing file up to this + # known length. + self.truncate(self._length) + + def _reset_content(self) -> None: + """Unset the internal length field and other bookkeeping. + + This method is called in ``__exit__``.""" + super()._reset_content() + + if self._length is not None: + logger.debug("unsetting content length (was: %d)", self._length) + self._length = None + + def read(self, size: int = -1) -> bytes: + """Read up to size bytes from the object and return them. + + As a convenience, if size is unspecified or -1, + all bytes until EOF are returned. Fewer than + size bytes may be returned if EOF is reached. + + :raises ValueError: if ``__enter__`` was not called beforehand. + """ + if self._length is None: + raise ValueError(".__enter__() must be called to set up content length") + cur = self.tell() + logger.debug("read size %d at %d from lazy file %s", size, cur, self.name) + if size < 0: + assert cur <= self._length + download_size = self._length - cur + elif size == 0: + return b"" + else: + download_size = size + stop = min(cur + download_size, self._length) + self.ensure_downloaded(cur, stop) + return super().read(download_size) + + +class LazyHTTPFile(FixedSizeLazyResource): + """File-like object representing a fixed-length file over HTTP. + + This uses HTTP range requests to lazily fetch the file's content into a temporary + file. If such requests are not supported by the server, raises + ``HTTPRangeRequestUnsupported`` in the ``__enter__`` method.""" + + def __init__( + self, + url: str, + session: PipSession, + delete_backing_file: bool = True, + ) -> None: + super().__init__(cast(BinaryIO, NamedTemporaryFile(delete=delete_backing_file))) + + self._request_count = 0 + self._session = session + self._url = url + + def _fetch_content_length(self) -> int: + """Get the remote file's length from a HEAD request.""" + # NB: This is currently dead code, as _fetch_content_length() is overridden + # again in LazyWheelOverHTTP. + return self._content_length_from_head() + + def fetch_content_range(self, start: int, end: int) -> Iterator[bytes]: + """Perform a series of HTTP range requests to cover the specified byte range.""" + for chunk in self._stream_response(start, end).iter_content(CONTENT_CHUNK_SIZE): + yield chunk + + def __enter__(self) -> LazyHTTPFile: + """Fetch the remote file length and reset the log of downloaded intervals. + + This method must be called before ``.read()``. 
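+
+        A sketch of intended use (illustrative; assumes ``url`` and ``session``
+        already exist)::
+
+            with LazyHTTPFile(url, session) as f:
+                f.seek(-10, io.SEEK_END)  # the remote length is known by now
+                tail = f.read(10)  # triggers a single ranged GET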
+ + :raises HTTPRangeRequestUnsupported: if the remote server fails to indicate + support for "bytes" ranges. + """ + super().__enter__() + return self + + def __exit__(self, *exc: Any) -> None: + """Logs request count to quickly identify any pathological cases in log data.""" + logger.debug("%d requests for url %s", self._request_count, self._url) + super().__exit__(*exc) + + def _content_length_from_head(self) -> int: + """Performs a HEAD request to extract the Content-Length. + + :raises HTTPRangeRequestUnsupported: if the response fails to indicate support + for "bytes" ranges.""" + self._request_count += 1 + head = self._session.head(self._url, headers=self._uncached_headers()) + head.raise_for_status() + assert head.status_code == codes.ok + accepted_range = head.headers.get("Accept-Ranges", None) + if accepted_range != "bytes": + raise HTTPRangeRequestUnsupported( + f"server does not support byte ranges: header was '{accepted_range}'" + ) + return int(head.headers["Content-Length"]) + + def _stream_response(self, start: int, end: int) -> Response: + """Return streaming HTTP response to a range request from start to end.""" + headers = self._uncached_headers() headers["Range"] = f"bytes={start}-{end}" - # TODO: Get range requests to be correctly cached - headers["Cache-Control"] = "no-cache" - return self._session.get(self._url, headers=headers, stream=True) + logger.debug("streamed bytes request: %s", headers["Range"]) + self._request_count += 1 + response = self._session.get(self._url, headers=headers, stream=True) + response.raise_for_status() + assert int(response.headers["Content-Length"]) == (end - start + 1) + return response + + @classmethod + def _uncached_headers(cls) -> dict[str, str]: + """HTTP headers to bypass any HTTP caching. + + The requests we perform in this file are intentionally small, and any caching + should be done at a higher level e.g. https://github.com/pypa/pip/issues/12184. + """ + # "no-cache" is the correct value for "up to date every time", so this will also + # ensure we get the most recent value from the server: + # https://developer.mozilla.org/en-US/docs/Web/HTTP/Caching#provide_up-to-date_content_every_time + return {**HEADERS, "Cache-Control": "no-cache"} - def _merge( - self, start: int, end: int, left: int, right: int - ) -> Generator[Tuple[int, int], None, None]: - """Return a generator of intervals to be fetched. - Args: - start (int): Start of needed interval - end (int): End of needed interval - left (int): Index of first overlapping downloaded data - right (int): Index after last overlapping downloaded data +class LazyWheelOverHTTP(LazyHTTPFile): + """File-like object mapped to a ZIP file over HTTP. + + This uses HTTP range requests to lazily fetch the file's content, which should be + provided as the first argument to a ``ZipFile``. + """ + + # Cache this on the type to avoid trying and failing our initial lazy wheel request + # multiple times in the same pip invocation against an index without this support. + _domains_without_negative_range: ClassVar[set[str]] = set() + + # This override is needed for mypy so we can call .prefetch_contiguous_dist_info() + # within a `with` block. + def __enter__(self) -> LazyWheelOverHTTP: + """Fetch the remote file length and reset the log of downloaded intervals. + + This method must be called before ``.read()`` or + ``.prefetch_contiguous_dist_info()``. 
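+
+        The expected calling sequence mirrors ``dist_from_wheel_url`` (sketch)::
+
+            with LazyWheelOverHTTP(url, session) as lazy_file:
+                lazy_file.prefetch_contiguous_dist_info(name)
+                wheel = MemoryWheel(lazy_file.name, lazy_file)
+                dist = get_wheel_distribution(wheel, canonicalize_name(name))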
""" - lslice, rslice = self._left[left:right], self._right[left:right] - i = start = min([start] + lslice[:1]) - end = max([end] + rslice[-1:]) - for j, k in zip(lslice, rslice): - if j > i: - yield i, j - 1 - i = k + 1 - if i <= end: - yield i, end - self._left[left:right], self._right[left:right] = [start], [end] + super().__enter__() + return self - def _download(self, start: int, end: int) -> None: - """Download bytes from start to end inclusively.""" - with self._stay(): - left = bisect_left(self._right, start) - right = bisect_right(self._left, end) - for start, end in self._merge(start, end, left, right): - response = self._stream_response(start, end) - response.raise_for_status() - self.seek(start) - for chunk in response_chunks(response, self._chunk_size): - self._file.write(chunk) + @classmethod + def _initial_chunk_length(cls) -> int: + """Return the size of the chunk (in bytes) to download from the end of the file. + + This method is called in ``self._fetch_content_length()``. If + the chunk size from this method is larger than the size of an entire wheel, that + may raise an HTTP error, but this is gracefully handled in + ``self._fetch_content_length()`` with a small performance penalty. + + Performance testing for values returned by this method (radoering): + https://github.com/pypa/pip/pull/12208#discussion_r1434335276 + > could not find any significant differences for values between 1.000 and + > 100.000 in a real-world example with more than 300 dependencies. Values below + > 1.000 resulted in a slightly worse performance, larger values like 1.000.000 + > resulted in a significant worse performance. + """ + return 10_000 + + def _fetch_content_length(self) -> int: + """Get the total remote file length, but also download a chunk from the end. + + This method is called within ``__enter__``. In an attempt to reduce + the total number of requests needed to populate this lazy file's contents, this + method will also attempt to fetch a chunk of the file's actual content. This + chunk will be ``self._initial_chunk_length()`` bytes in size, or just the remote + file's length if that's smaller, and the chunk will come from the *end* of + the file. + + This method will first attempt to download with a negative byte range request, + i.e. a GET with the headers ``Range: bytes=-N`` for ``N`` equal to + ``self._initial_chunk_length()``. If negative offsets are unsupported, it will + instead fall back to making a HEAD request first to extract the length, followed + by a GET request with the double-ended range header ``Range: bytes=X-Y`` to + extract the final ``N`` bytes from the remote resource. + """ + initial_chunk_size = self._initial_chunk_length() + ret_length, tail = self._extract_content_length(initial_chunk_size) + + # Need to explicitly truncate here in order to perform the write and seek + # operations below when we write the chunk of file contents to disk. + self.truncate(ret_length) + + if tail is None: + # If we could not download any file contents yet (e.g. if negative byte + # ranges were not supported, or the requested range was larger than the file + # size), then download all of this at once, hopefully pulling in the entire + # central directory. + initial_start = max(0, ret_length - initial_chunk_size) + self.ensure_downloaded(initial_start, ret_length) + else: + # If we *could* download some file contents, then write them to the end of + # the file and set up our bisect boundaries by hand. 
+            with self._stay():
+                response_length = int(tail.headers["Content-Length"])
+
+                # Some servers return a 200 OK in response to an overflowing negative
+                # range request. In that case we want to write the entire file
+                # contents.
+                if tail.status_code == codes.ok:
+                    assert ret_length == response_length
+                    assert ret_length > 0
+                    assert response_length <= initial_chunk_size
+                    for chunk in tail.iter_content(CONTENT_CHUNK_SIZE):
+                        self._file.write(chunk)
+                    assert self._merge_intervals is not None
+                    assert ((0, response_length - 1),) == tuple(
+                        self._merge_intervals.minimal_intervals_covering(
+                            0, response_length - 1
+                        )
+                    )
+                    return response_length
+                else:
+                    assert response_length == initial_chunk_size
+                    self.seek(-response_length, io.SEEK_END)
+                    # The default initial chunk size is currently 10 kB (see
+                    # ``_initial_chunk_length()``), but streaming the content here
+                    # allows it to be set arbitrarily large.
+                    for chunk in tail.iter_content(CONTENT_CHUNK_SIZE):
+                        self._file.write(chunk)
+
+                    # We now need to update our bookkeeping to cover the interval we
+                    # just wrote to the file, so we know not to fetch it again in
+                    # later read()s.
+                    init_chunk_start = ret_length - response_length
+                    # MergeIntervals uses inclusive boundaries i.e. start <= x <= end.
+                    init_chunk_end = ret_length - 1
+                    assert self._merge_intervals is not None
+                    assert ((init_chunk_start, init_chunk_end),) == tuple(
+                        # NB: We expect LazyRemoteResource to reset
+                        # `self._merge_intervals` just before it calls the current
+                        # method, so our assertion here checks that indeed no prior
+                        # overlapping intervals have been covered.
+                        self._merge_intervals.minimal_intervals_covering(
+                            init_chunk_start, init_chunk_end
+                        )
+                    )
+        return ret_length
+
+    @staticmethod
+    def _parse_full_length_from_content_range(arg: str) -> int:
+        """Parse the file's full underlying length from the Content-Range header.
+
+        This supports both * and numeric ranges, from success or error responses:
+        https://www.rfc-editor.org/rfc/rfc9110#field.content-range.
+        """
+        m = re.match(r"bytes [^/]+/([0-9]+)", arg)
+        if m is None:
+            raise HTTPRangeRequestUnsupported(f"could not parse Content-Range: '{arg}'")
+        return int(m.group(1))
+
+    def _try_initial_chunk_request(
+        self, initial_chunk_size: int
+    ) -> tuple[int, Response]:
+        """Attempt to fetch a chunk from the end of the file with a negative offset."""
+        headers = self._uncached_headers()
+        # Request a negative byte range (a suffix of the file), which some servers
+        # do not support.
+        headers["Range"] = f"bytes=-{initial_chunk_size}"
+        logger.debug("initial bytes request: %s", headers["Range"])
+
+        self._request_count += 1
+        tail = self._session.get(self._url, headers=headers, stream=True)
+        tail.raise_for_status()
+
+        code = tail.status_code
+        if code != codes.partial_content:
+            # According to
+            # https://developer.mozilla.org/en-US/docs/Web/HTTP/Range_requests,
+            # a 200 OK implies that range requests are not supported, regardless of
+            # the requested size.
+            # However, some servers that support negative range requests also return
+            # a 200 OK if the requested range from the end was larger than the
+            # file size.
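+            # For example (illustrative numbers): "Range: bytes=-2000" against a
+            # 1000-byte file may yield a 200 OK with "Content-Length: 1000" rather
+            # than a 416 error.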
+ if code == codes.ok: + accept_ranges = tail.headers.get("Accept-Ranges", None) + content_length = int(tail.headers["Content-Length"]) + if accept_ranges == "bytes" and content_length <= initial_chunk_size: + return content_length, tail + raise HTTPRangeRequestUnsupported( + f"did not receive partial content: got code {code}" + ) + + file_length = self._parse_full_length_from_content_range( + tail.headers["Content-Range"] + ) + return (file_length, tail) + + def _extract_content_length( + self, initial_chunk_size: int + ) -> tuple[int, Response | None]: + """Get the Content-Length of the remote file, and possibly a chunk of it.""" + domain = urlparse(self._url).netloc + if domain in self._domains_without_negative_range: + return (self._content_length_from_head(), None) + + try: + # Initial range request for just the end of the file. + file_length, tail = self._try_initial_chunk_request(initial_chunk_size) + except HTTPError as e: + resp = e.response + code = resp.status_code + # Our initial request using a negative byte range was not supported. + if code in [codes.not_implemented, codes.method_not_allowed]: + # pypi notably does not support negative byte ranges: see + # https://github.com/pypi/warehouse/issues/12823. + logger.debug( + "Negative byte range not supported for domain '%s': " + "using HEAD request before lazy wheel from now on", + domain, + ) + # Avoid trying a negative byte range request against this domain for the + # rest of the resolve. + self._domains_without_negative_range.add(domain) + # Apply a HEAD request to get the real size, and nothing else for now. + return (self._content_length_from_head(), None) + # This indicates that the requested range from the end was larger than the + # actual file size: https://www.rfc-editor.org/rfc/rfc9110#status.416. + if code == codes.requested_range_not_satisfiable: + # In this case, we don't have any file content yet, but we do know the + # size the file will be, so we can return that and exit here. + file_length = self._parse_full_length_from_content_range( + resp.headers["Content-Range"] + ) + return (file_length, None) + # If we get some other error, then we expect that non-range requests will + # also fail, so we error out here and let the user figure it out. + raise + + # Some servers do not actually support negative offsets, but instead sneakily + # coerce a negative offset like "-10" into "0-10", which is a *positive* offset! + tail_length = int(tail.headers["Content-Length"]) + tail_range = tail.headers.get("Content-Range", "") + if (tail_length == initial_chunk_size + 1) and tail_range.startswith( + f"bytes 0-{initial_chunk_size}" + ): + logger.debug( + "an intended negative range -%d was converted to 0-%d for domain %s: " + "using HEAD request before lazy wheel from now on", + initial_chunk_size, + initial_chunk_size, + domain, + ) + # Avoid trying a negative byte range request against this domain for the + # rest of the resolve. + self._domains_without_negative_range.add(domain) + # Apply a HEAD request to get the real size, and nothing else for now. + return (self._content_length_from_head(), None) + return file_length, tail + + def prefetch_contiguous_dist_info(self, name: str) -> None: + """Read contents of entire dist-info section of wheel. + + We know pip will read every entry in this directory when generating a dist from + a wheel, so prepopulating the file contents avoids waiting for further + range requests. 
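+
+        For example (illustrative layout), for a wheel whose entries end with::
+
+            simple/__init__.py
+            simple-1.0.dist-info/METADATA
+            simple-1.0.dist-info/RECORD
+
+        both ``.dist-info/`` entries are covered by one ranged GET.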
+ """ + # Clarify in debug output which requests were sent during __init__, which during + # the prefetch, and which during the dist metadata generation. + logger.debug("begin prefetching dist-info for %s", name) + with indent_log(): + self._prefetch_contiguous_dist_info(name) + logger.debug("done prefetching dist-info for %s", name) + + def _prefetch_contiguous_dist_info(self, name: str) -> None: + """Locate the *.dist-info/ entries from a temporary ``ZipFile`` wrapper, and + download them. + + This method assumes that the *.dist-info directory (containing e.g. METADATA) is + contained in a single contiguous section of the zip file in order to ensure it + can be downloaded in a single ranged GET request.""" + dist_info_prefix = re.compile(r"^[^/]*\.dist-info/") + start: int | None = None + end: int | None = None + + # This may perform further requests if __init__() did not pull in the entire + # central directory at the end of the file (although _initial_chunk_length() + # should be set large enough to avoid this). + zf = ZipFile(self) + + for info in zf.infolist(): + if start is None: + if dist_info_prefix.search(info.filename): + start = info.header_offset + continue + else: + # The last .dist-info/ entry may be before the end of the file if the + # wheel's entries are sorted lexicographically (which is unusual). + if not dist_info_prefix.search(info.filename): + end = info.header_offset + break + if start is None: + raise UnsupportedWheel( + f"no {dist_info_prefix!r} directory found for {name} in {self.name}" + ) + # If the last entries of the zip are the .dist-info/ dir (as usual), then give + # us everything until the start of the central directory. + if end is None: + end = zf.start_dir + self.ensure_downloaded(start, end) diff --git a/src/pip/_internal/operations/prepare.py b/src/pip/_internal/operations/prepare.py index e6aa3447200..7ca0855ad76 100644 --- a/src/pip/_internal/operations/prepare.py +++ b/src/pip/_internal/operations/prepare.py @@ -273,6 +273,9 @@ def __init__( # Previous "header" printed for a link-based InstallRequirement self._previous_requirement_header = ("", "") + # Whether an HTTPRangeRequestUnsupported was already thrown for this domain. 
+ self._domains_without_http_range: set[str] = set() + def _log_preparing_link(self, req: InstallRequirement) -> None: """Provide context for the requirement being prepared.""" if req.link.is_file and not req.is_wheel_from_cache: @@ -430,6 +433,9 @@ def _fetch_metadata_using_lazy_wheel( ) return None + if link.netloc in self._domains_without_http_range: + return None + wheel = Wheel(link.filename) name = canonicalize_name(wheel.name) logger.info( @@ -437,11 +443,16 @@ def _fetch_metadata_using_lazy_wheel( name, wheel.version, ) - url = link.url.split("#", 1)[0] try: - return dist_from_wheel_url(name, url, self._session) - except HTTPRangeRequestUnsupported: - logger.debug("%s does not support range requests", url) + return dist_from_wheel_url(name, link.url_without_fragment, self._session) + except HTTPRangeRequestUnsupported as e: + logger.debug( + "domain '%s' does not support range requests (%s): " + "not attempting lazy wheel further", + link.netloc, + str(e), + ) + self._domains_without_http_range.add(link.netloc) return None def _complete_partial_requirements( diff --git a/src/pip/_internal/utils/wheel.py b/src/pip/_internal/utils/wheel.py index f85aee8a3f9..8474d818fcc 100644 --- a/src/pip/_internal/utils/wheel.py +++ b/src/pip/_internal/utils/wheel.py @@ -67,6 +67,10 @@ def wheel_dist_info_dir(source: ZipFile, name: str) -> str: def read_wheel_metadata_file(source: ZipFile, path: str) -> bytes: + if src_fp := source.fp: + src_name = getattr(src_fp, "name", None) + logger.debug("extracting entry '%s' from zip '%s'", path, src_name) + try: return source.read(path) # BadZipFile for general corruption, KeyError for missing entry, diff --git a/tests/conftest.py b/tests/conftest.py index da4ab5b9dfb..e2f4a2feb04 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3 +1,4 @@ +import abc import compileall import contextlib import fnmatch @@ -8,6 +9,7 @@ import subprocess import sys import threading +from contextlib import AbstractContextManager, contextmanager from dataclasses import dataclass from enum import Enum from hashlib import sha256 @@ -17,6 +19,7 @@ TYPE_CHECKING, Any, AnyStr, + BinaryIO, Callable, ClassVar, ContextManager, @@ -27,6 +30,8 @@ Optional, Set, Tuple, + Type, + Union, ) from unittest.mock import patch from zipfile import ZipFile @@ -55,6 +60,7 @@ PipTestEnvironment, ScriptFactory, TestData, + create_basic_wheel_for_package, ) from tests.lib.server import MockServer, make_mock_server from tests.lib.venv import VirtualEnvironment, VirtualEnvironmentType @@ -651,6 +657,24 @@ def script( return script_factory(tmpdir.joinpath("workspace"), virtualenv) +@pytest.fixture(scope="session") +def session_script( + request: pytest.FixtureRequest, + tmpdir_factory: pytest.TempPathFactory, + virtualenv_factory: Callable[[Path], VirtualEnvironment], + script_factory: ScriptFactory, +) -> PipTestEnvironment: + """PipTestEnvironment shared across the whole session. + + This is used by session-scoped fixtures. Tests should use the + function-scoped ``script`` fixture instead. 
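+
+    A session scope is used so that expensive shared artifacts (e.g. the wheels
+    generated for the ``fake_packages`` fixture) are only built once per session.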
+ """ + virtualenv = virtualenv_factory( + tmpdir_factory.mktemp("session_venv").joinpath("venv") + ) + return script_factory(tmpdir_factory.mktemp("session_workspace"), virtualenv) + + @pytest.fixture(scope="session") def common_wheels() -> Path: """Provide a directory with latest setuptools and wheel wheels""" @@ -733,6 +757,39 @@ class MetadataKind(Enum): NoFile = "no-file" +@dataclass(frozen=True) +class FakePackageSource: + """A test package file which may be hardcoded or generated dynamically.""" + + source_file: Union[str, Path] + + @classmethod + def shared_data_package(cls, name: str) -> "FakePackageSource": + return cls(source_file=name) + + @property + def _is_shared_data(self) -> bool: + return isinstance(self.source_file, str) + + @classmethod + def generated_wheel(cls, path: Path) -> "FakePackageSource": + return cls(source_file=path) + + @property + def filename(self) -> str: + if self._is_shared_data: + assert isinstance(self.source_file, str) + return self.source_file + assert isinstance(self.source_file, Path) + return self.source_file.name + + def source_path(self, shared_data: TestData) -> Path: + if self._is_shared_data: + return shared_data.packages / self.filename + assert isinstance(self.source_file, Path) + return self.source_file + + @dataclass(frozen=True) class FakePackage: """Mock package structure used to generate a PyPI repository. @@ -742,13 +799,20 @@ class FakePackage: name: str version: str - filename: str + source_file: FakePackageSource metadata: MetadataKind # This will override any dependencies specified in the actual dist's METADATA. requires_dist: Tuple[str, ...] = () # This will override the Name specified in the actual dist's METADATA. metadata_name: Optional[str] = None + @property + def filename(self) -> str: + return self.source_file.filename + + def source_path(self, shared_data: TestData) -> Path: + return self.source_file.source_path(shared_data) + def metadata_filename(self) -> str: """This is specified by PEP 658.""" return f"{self.filename}.metadata" @@ -786,14 +850,49 @@ def generate_metadata(self) -> bytes: @pytest.fixture(scope="session") -def fake_packages() -> Dict[str, List[FakePackage]]: +def fake_packages(session_script: PipTestEnvironment) -> Dict[str, List[FakePackage]]: """The package database we generate for testing PEP 658 support.""" + large_compilewheel_metadata_first = create_basic_wheel_for_package( + session_script, + "compilewheel", + "2.0", + extra_files={"asdf.txt": b"a" * 10_000}, + metadata_first=True, + ) + # This wheel must be larger than 10KB to trigger the lazy wheel behavior we want + # to test. + assert large_compilewheel_metadata_first.stat().st_size > 10_000 + + large_translationstring_metadata_last = create_basic_wheel_for_package( + session_script, + "translationstring", + "0.1", + extra_files={"asdf.txt": b"a" * 10_000}, + metadata_first=False, + ) + assert large_translationstring_metadata_last.stat().st_size > 10_000 + return { "simple": [ - FakePackage("simple", "1.0", "simple-1.0.tar.gz", MetadataKind.Sha256), - FakePackage("simple", "2.0", "simple-2.0.tar.gz", MetadataKind.No), + FakePackage( + "simple", + "1.0", + FakePackageSource.shared_data_package("simple-1.0.tar.gz"), + MetadataKind.Sha256, + ), + FakePackage( + "simple", + "2.0", + FakePackageSource.shared_data_package("simple-2.0.tar.gz"), + MetadataKind.No, + ), # This will raise a hashing error. 
- FakePackage("simple", "3.0", "simple-3.0.tar.gz", MetadataKind.WrongHash), + FakePackage( + "simple", + "3.0", + FakePackageSource.shared_data_package("simple-3.0.tar.gz"), + MetadataKind.WrongHash, + ), ], "simple2": [ # Override the dependencies here in order to force pip to download @@ -801,17 +900,22 @@ def fake_packages() -> Dict[str, List[FakePackage]]: FakePackage( "simple2", "1.0", - "simple2-1.0.tar.gz", + FakePackageSource.shared_data_package("simple2-1.0.tar.gz"), MetadataKind.Unhashed, ("simple==1.0",), ), # This will raise an error when pip attempts to fetch the metadata file. - FakePackage("simple2", "2.0", "simple2-2.0.tar.gz", MetadataKind.NoFile), + FakePackage( + "simple2", + "2.0", + FakePackageSource.shared_data_package("simple2-2.0.tar.gz"), + MetadataKind.NoFile, + ), # This has a METADATA file with a mismatched name. FakePackage( "simple2", "3.0", - "simple2-3.0.tar.gz", + FakePackageSource.shared_data_package("simple2-3.0.tar.gz"), MetadataKind.Sha256, metadata_name="not-simple2", ), @@ -822,7 +926,9 @@ def fake_packages() -> Dict[str, List[FakePackage]]: FakePackage( "colander", "0.9.9", - "colander-0.9.9-py2.py3-none-any.whl", + FakePackageSource.shared_data_package( + "colander-0.9.9-py2.py3-none-any.whl" + ), MetadataKind.No, ), ], @@ -832,25 +938,47 @@ def fake_packages() -> Dict[str, List[FakePackage]]: FakePackage( "compilewheel", "1.0", - "compilewheel-1.0-py2.py3-none-any.whl", + FakePackageSource.shared_data_package( + "compilewheel-1.0-py2.py3-none-any.whl" + ), MetadataKind.Unhashed, ("simple==1.0",), ), + # This inserts a wheel larger than the default fast-deps request size with + # .dist-info metadata at the front. + FakePackage( + "compilewheel", + "2.0", + FakePackageSource.generated_wheel(large_compilewheel_metadata_first), + MetadataKind.No, + ), ], "has-script": [ # Ensure we check PEP 658 metadata hashing errors for wheel files. FakePackage( "has-script", "1.0", - "has.script-1.0-py2.py3-none-any.whl", + FakePackageSource.shared_data_package( + "has.script-1.0-py2.py3-none-any.whl" + ), MetadataKind.WrongHash, ), ], "translationstring": [ + # This inserts a wheel larger than the default fast-deps request size with + # .dist-info metadata at the back. + FakePackage( + "translationstring", + "0.1", + FakePackageSource.generated_wheel( + large_translationstring_metadata_last + ), + MetadataKind.No, + ), FakePackage( "translationstring", "1.1", - "translationstring-1.1.tar.gz", + FakePackageSource.shared_data_package("translationstring-1.1.tar.gz"), MetadataKind.No, ), ], @@ -859,7 +987,9 @@ def fake_packages() -> Dict[str, List[FakePackage]]: FakePackage( "priority", "1.0", - "priority-1.0-py2.py3-none-any.whl", + FakePackageSource.shared_data_package( + "priority-1.0-py2.py3-none-any.whl" + ), MetadataKind.NoFile, ), ], @@ -868,7 +998,9 @@ def fake_packages() -> Dict[str, List[FakePackage]]: FakePackage( "requires-simple-extra", "0.1", - "requires_simple_extra-0.1-py2.py3-none-any.whl", + FakePackageSource.shared_data_package( + "requires_simple_extra-0.1-py2.py3-none-any.whl" + ), MetadataKind.Sha256, metadata_name="Requires_Simple.Extra", ), @@ -921,9 +1053,11 @@ def html_index_for_packages( download_links.append( f' {package_link.filename}
' # noqa: E501 ) - # (3.2) Copy over the corresponding file in `shared_data.packages`. + # (3.2) Copy over the corresponding file in `shared_data.packages`, or the + # generated wheel path if provided. + source_path = package_link.source_path(shared_data) shutil.copy( - shared_data.packages / package_link.filename, + source_path, pkg_subdir / package_link.filename, ) # (3.3) Write a metadata file, if applicable. @@ -959,6 +1093,7 @@ class OneTimeDownloadHandler(http.server.SimpleHTTPRequestHandler): """Serve files from the current directory, but error if a file is downloaded more than once.""" + # NB: Needs to be set on per-function subclass. _seen_paths: ClassVar[Set[str]] = set() def do_GET(self) -> None: @@ -1004,3 +1139,241 @@ class Handler(OneTimeDownloadHandler): finally: httpd.shutdown() server_thread.join() + + +class RangeHandler(Enum): + """All the modes of handling range requests we want pip to handle.""" + + Always200OK = "always-200-ok" + NoNegativeRange = "no-negative-range" + SneakilyCoerceNegativeRange = "sneakily-coerce-negative-range" + SupportsNegativeRange = "supports-negative-range" + NegativeRangeOverflowing = "negative-range-overflowing" + + def supports_range(self) -> bool: + return self in [ + type(self).NoNegativeRange, + type(self).SneakilyCoerceNegativeRange, + type(self).SupportsNegativeRange, + type(self).NegativeRangeOverflowing, + ] + + def supports_negative_range(self) -> bool: + return self in [ + type(self).SupportsNegativeRange, + type(self).NegativeRangeOverflowing, + ] + + def sneakily_coerces_negative_range(self) -> bool: + return self == type(self).SneakilyCoerceNegativeRange + + def overflows_negative_range(self) -> bool: + return self == type(self).NegativeRangeOverflowing + + +class ContentRangeDownloadHandler( + http.server.SimpleHTTPRequestHandler, metaclass=abc.ABCMeta +): + """Extend the basic ``http.server`` to support content ranges.""" + + @abc.abstractproperty + def range_handler(self) -> RangeHandler: ... + + # NB: Needs to be set on per-function subclasses. + get_request_counts: ClassVar[Dict[str, int]] = {} + positive_range_request_paths: ClassVar[Set[str]] = set() + negative_range_request_paths: ClassVar[Set[str]] = set() + head_request_paths: ClassVar[Set[str]] = set() + ok_response_counts: ClassVar[Dict[str, int]] = {} + + @contextmanager + def _translate_path(self) -> Iterator[Optional[Tuple[BinaryIO, str, int]]]: + # Only test fast-deps, not PEP 658. + if self.path.endswith(".metadata"): + self.send_error(http.HTTPStatus.NOT_FOUND, "File not found") + yield None + return + + path = self.translate_path(self.path) + if os.path.isdir(path): + path = os.path.join(path, "index.html") + + ctype = self.guess_type(path) + try: + with open(path, "rb") as f: + fs = os.fstat(f.fileno()) + full_file_length = fs[6] + + yield (f, ctype, full_file_length) + except OSError: + self.send_error(http.HTTPStatus.NOT_FOUND, "File not found") + yield None + return + + def _send_basic_headers(self, ctype: str) -> None: + self.send_header("Content-Type", ctype) + if self.range_handler.supports_range(): + self.send_header("Accept-Ranges", "bytes") + # NB: callers must call self.end_headers()! 
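+    # An illustrative exchange (SupportsNegativeRange mode; numbers made up),
+    # mirroring the suffix request pip's LazyWheelOverHTTP sends first:
+    #   GET /pkg.whl with "Range: bytes=-10000"
+    #   -> 206 Partial Content, "Content-Range: bytes 90000-99999/100000"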
+ + def _send_full_file_headers(self, ctype: str, full_file_length: int) -> None: + self.send_response(http.HTTPStatus.OK) + self.ok_response_counts.setdefault(self.path, 0) + self.ok_response_counts[self.path] += 1 + self._send_basic_headers(ctype) + self.send_header("Content-Length", str(full_file_length)) + self.end_headers() + + def do_HEAD(self) -> None: + self.head_request_paths.add(self.path) + + with self._translate_path() as x: + if x is None: + return + (_, ctype, full_file_length) = x + self._send_full_file_headers(ctype, full_file_length) + + def do_GET(self) -> None: + self.get_request_counts.setdefault(self.path, 0) + self.get_request_counts[self.path] += 1 + + with self._translate_path() as x: + if x is None: + return + (f, ctype, full_file_length) = x + range_arg = self.headers.get("Range", None) + if range_arg is not None: + m = re.match(r"bytes=([0-9]+)?-([0-9]+)", range_arg) + if m is not None: + if m.group(1) is None: + self.negative_range_request_paths.add(self.path) + else: + self.positive_range_request_paths.add(self.path) + # If no range given, return the whole file. + if range_arg is None or not self.range_handler.supports_range(): + self._send_full_file_headers(ctype, full_file_length) + self.copyfile(f, self.wfile) # type: ignore[misc] + return + # Otherwise, return the requested contents. + assert m is not None + # This is a "start-end" range. + if m.group(1) is not None: + start = int(m.group(1)) + end = int(m.group(2)) + assert start <= end + was_out_of_bounds = (end + 1) > full_file_length + else: + # This is a "-end" range. + if self.range_handler.sneakily_coerces_negative_range(): + end = int(m.group(2)) + self.send_response(http.HTTPStatus.PARTIAL_CONTENT) + self._send_basic_headers(ctype) + self.send_header("Content-Length", str(end + 1)) + self.send_header( + "Content-Range", f"bytes 0-{end}/{full_file_length}" + ) + self.end_headers() + f.seek(0) + self.wfile.write(f.read(end + 1)) + return + if not self.range_handler.supports_negative_range(): + self.send_response(http.HTTPStatus.NOT_IMPLEMENTED) + self._send_basic_headers(ctype) + self.end_headers() + return + end = full_file_length - 1 + start = end - int(m.group(2)) + 1 + was_out_of_bounds = start < 0 + if was_out_of_bounds: + if self.range_handler.overflows_negative_range(): + self._send_full_file_headers(ctype, full_file_length) + self.copyfile(f, self.wfile) # type: ignore[misc] + return + self.send_response(http.HTTPStatus.REQUESTED_RANGE_NOT_SATISFIABLE) + self._send_basic_headers(ctype) + self.send_header("Content-Range", f"bytes */{full_file_length}") + self.end_headers() + return + sent_length = end - start + 1 + self.send_response(http.HTTPStatus.PARTIAL_CONTENT) + self._send_basic_headers(ctype) + self.send_header("Content-Length", str(sent_length)) + self.send_header("Content-Range", f"bytes {start}-{end}/{full_file_length}") + self.end_headers() + f.seek(start) + self.wfile.write(f.read(sent_length)) + + +@pytest.fixture(scope="session") +def html_index_no_metadata( + html_index_for_packages: Path, + tmpdir_factory: pytest.TempPathFactory, +) -> Path: + """Return an index like ``html_index_for_packages`` without any PEP 658 metadata. 
+ + While we already return a 404 in ``ContentRangeDownloadHandler`` for ``.metadata`` + paths, we need to also remove ``data-dist-info-metadata`` attrs on ```` tags, + otherwise pip will error after attempting to retrieve the metadata files.""" + new_html_dir = tmpdir_factory.mktemp("fake_index_html_content_no_metadata") + new_html_dir.rmdir() + shutil.copytree(html_index_for_packages, new_html_dir) + for index_page in new_html_dir.rglob("index.html"): + prev_index = index_page.read_text() + no_metadata_index = re.sub(r'data-dist-info-metadata="[^"]+"', "", prev_index) + index_page.write_text(no_metadata_index) + return new_html_dir + + +HTMLIndexWithRangeServer = Callable[ + [RangeHandler], + "AbstractContextManager[Type[ContentRangeDownloadHandler]]", +] + + +@pytest.fixture +def html_index_with_range_server( + html_index_no_metadata: Path, + port: int = 8000, +) -> HTMLIndexWithRangeServer: + """Serve files from a generated pypi index, with support for range requests. + + Provide `-i http://localhost:` to pip invocations to point them at + this server. + """ + + class InDirectoryServer(http.server.ThreadingHTTPServer): + def finish_request(self, request: Any, client_address: Any) -> None: + self.RequestHandlerClass( + request, client_address, self, directory=str(html_index_no_metadata) # type: ignore[call-arg,arg-type] + ) + + @contextmanager + def inner( + range_handler: RangeHandler, + ) -> Iterator[Type[ContentRangeDownloadHandler]]: + class Handler(ContentRangeDownloadHandler): + @property + def range_handler(self) -> RangeHandler: + return range_handler + + get_request_counts: ClassVar[Dict[str, int]] = {} + positive_range_request_paths: ClassVar[Set[str]] = set() + negative_range_request_paths: ClassVar[Set[str]] = set() + head_request_paths: ClassVar[Set[str]] = set() + ok_response_counts: ClassVar[Dict[str, int]] = {} + + with InDirectoryServer(("", port), Handler) as httpd: + server_thread = threading.Thread(target=httpd.serve_forever) + server_thread.start() + + try: + yield Handler + finally: + httpd.shutdown() + server_thread.join(timeout=3.0) + if server_thread.is_alive(): + raise RuntimeError( + "failed to shutdown http server within 3 seconds" + ) + + return inner diff --git a/tests/functional/test_download.py b/tests/functional/test_download.py index d469e71c360..b2a82b68d93 100644 --- a/tests/functional/test_download.py +++ b/tests/functional/test_download.py @@ -1306,7 +1306,7 @@ def run_for_generated_index( ["colander-0.9.9-py2.py3-none-any.whl", "translationstring-1.1.tar.gz"], ), ( - "compilewheel", + "compilewheel==1.0", ["compilewheel-1.0-py2.py3-none-any.whl", "simple-1.0.tar.gz"], ), ], @@ -1339,7 +1339,7 @@ def test_download_metadata( "/colander/colander-0.9.9-py2.py3-none-any.whl", ), ( - "compilewheel", + "compilewheel==1.0", [ "compilewheel-1.0-py2.py3-none-any.whl", "simple-1.0.tar.gz", diff --git a/tests/functional/test_fast_deps.py b/tests/functional/test_fast_deps.py index 5a910b89763..0e7e45eabc4 100644 --- a/tests/functional/test_fast_deps.py +++ b/tests/functional/test_fast_deps.py @@ -4,12 +4,14 @@ import pathlib import re from os.path import basename -from typing import Iterable +from pathlib import Path +from typing import Iterable, List import pytest from pip._vendor.packaging.utils import canonicalize_name from pip._internal.utils.misc import hash_file +from tests.conftest import HTMLIndexWithRangeServer, RangeHandler from tests.lib import PipTestEnvironment, TestData, TestPipResult @@ -20,6 +22,7 @@ def pip(script: PipTestEnvironment, command: 
str, requirement: str) -> TestPipRe "--no-cache-dir", "--use-feature=fast-deps", requirement, + # TODO: remove this when fast-deps is on by default. allow_stderr_warning=True, ) @@ -45,6 +48,31 @@ def test_install_from_pypi( assert_installed(script, expected) +@pytest.mark.network +@pytest.mark.parametrize( + "requirement, url, expected", + [ + ( + "wcwidth==0.2.1", + "https://files.pythonhosted.org/packages/6c/a6/cdb485093ad4017d874d7a2e6a736d02720258f57876548eea2bf04c76f0/wcwidth-0.2.1-py2.py3-none-any.whl", + "multiple .dist-info directories found", + ), + ], +) +def test_invalid_wheel_parse_error( + requirement: str, url: str, expected: str, script: PipTestEnvironment +) -> None: + """Check for both the full download URL and the reason for the error.""" + result = script.pip( + "install", + "--use-feature=fast-deps", + requirement, + expect_error=True, + ) + assert url in result.stderr + assert expected in result.stderr + + @pytest.mark.network @pytest.mark.parametrize( "requirement, expected", @@ -136,3 +164,188 @@ def test_hash_mismatch_existing_download_for_metadata_only_wheel( hash_file(str(idna_wheel))[0].hexdigest() == "b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0" ) + + +@pytest.mark.parametrize("range_handler", list(RangeHandler)) +def test_download_range( + script: PipTestEnvironment, + tmpdir: Path, + html_index_with_range_server: HTMLIndexWithRangeServer, + range_handler: RangeHandler, +) -> None: + """Execute `pip download` against a generated PyPI index.""" + download_dir = tmpdir / "download_dir" + + def run_for_generated_index(args: List[str]) -> TestPipResult: + """ + Produce a PyPI directory structure pointing to the specified packages, then + execute `pip download -i ...` pointing to our generated index. + """ + pip_args = [ + "download", + "--use-feature=fast-deps", + "-d", + str(download_dir), + "-i", + "http://localhost:8000", + *args, + ] + return script.pip(*pip_args, allow_stderr_warning=True) + + with html_index_with_range_server(range_handler) as handler: + run_for_generated_index( + ["colander", "compilewheel==2.0", "has-script", "translationstring==0.1"] + ) + generated_files = os.listdir(download_dir) + assert fnmatch.filter(generated_files, "colander*.whl") + assert fnmatch.filter(generated_files, "compilewheel*.whl") + assert fnmatch.filter(generated_files, "has.script*.whl") + assert fnmatch.filter(generated_files, "translationstring*.whl") + + colander_wheel_path = "/colander/colander-0.9.9-py2.py3-none-any.whl" + compile_wheel_path = "/compilewheel/compilewheel-2.0-py2.py3-none-any.whl" + has_script_path = "/has-script/has.script-1.0-py2.py3-none-any.whl" + translationstring_path = ( + "/translationstring/translationstring-0.1-py2.py3-none-any.whl" + ) + + if range_handler == RangeHandler.Always200OK: + assert not handler.head_request_paths + assert not handler.positive_range_request_paths + assert {colander_wheel_path} == handler.negative_range_request_paths + # Tries a range request, finds it's unsupported, so doesn't try it again. 
+ assert handler.get_request_counts[colander_wheel_path] == 2 + assert handler.ok_response_counts[colander_wheel_path] == 2 + assert handler.get_request_counts[compile_wheel_path] == 1 + assert handler.ok_response_counts[compile_wheel_path] == 1 + assert handler.get_request_counts[has_script_path] == 1 + assert handler.ok_response_counts[has_script_path] == 1 + assert handler.get_request_counts[translationstring_path] == 1 + assert handler.ok_response_counts[translationstring_path] == 1 + elif range_handler == RangeHandler.NoNegativeRange: + assert { + colander_wheel_path, + compile_wheel_path, + has_script_path, + translationstring_path, + } == handler.head_request_paths + assert { + colander_wheel_path, + compile_wheel_path, + has_script_path, + translationstring_path, + } == handler.positive_range_request_paths + # Tries this first, finds that negative offsets are unsupported, so doesn't + # try it again. + assert {colander_wheel_path} == handler.negative_range_request_paths + # Two more for the first wheel, because it has the failing negative + # byte request and is larger than the initial chunk size. + assert handler.get_request_counts[colander_wheel_path] == 4 + assert handler.ok_response_counts[colander_wheel_path] == 2 + # The .dist-info dir at the start requires an additional ranged GET vs + # translationstring. + assert handler.get_request_counts[compile_wheel_path] == 3 + assert handler.ok_response_counts[compile_wheel_path] == 2 + # The entire file should have been pulled in with a single ranged GET. + assert handler.get_request_counts[has_script_path] == 2 + assert handler.ok_response_counts[has_script_path] == 2 + # The entire .dist-info dir should have been pulled in with a single + # ranged GET. The second GET is for the end of the download, pulling down + # the entire file contents. + assert handler.get_request_counts[translationstring_path] == 2 + assert handler.ok_response_counts[translationstring_path] == 2 + elif range_handler == RangeHandler.SneakilyCoerceNegativeRange: + assert { + colander_wheel_path, + compile_wheel_path, + has_script_path, + translationstring_path, + } == handler.head_request_paths + assert { + colander_wheel_path, + compile_wheel_path, + has_script_path, + translationstring_path, + } == handler.positive_range_request_paths + # Tries this first, finds that negative offsets are unsupported, so doesn't + # try it again. + assert {colander_wheel_path} == handler.negative_range_request_paths + # Two more for the first wheel, because it has the failing negative + # byte request and is larger than the initial chunk size. + assert handler.get_request_counts[colander_wheel_path] == 4 + assert handler.ok_response_counts[colander_wheel_path] == 2 + # The .dist-info dir at the start requires an additional ranged GET vs + # translationstring. + assert handler.get_request_counts[compile_wheel_path] == 3 + assert handler.ok_response_counts[compile_wheel_path] == 2 + # The entire file should have been pulled in with a single ranged GET. + assert handler.get_request_counts[has_script_path] == 2 + assert handler.ok_response_counts[has_script_path] == 2 + # The entire .dist-info dir should have been pulled in with a single + # ranged GET. The second GET is for the end of the download, pulling down + # the entire file contents. 
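+            # (Expectations in this branch deliberately mirror NoNegativeRange:
+            # pip detects the sneakily coerced range, e.g. via the returned
+            # Content-Range, and falls back as if negative offsets were rejected.)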
+            assert handler.get_request_counts[translationstring_path] == 2
+            assert handler.ok_response_counts[translationstring_path] == 2
+        elif range_handler == RangeHandler.SupportsNegativeRange:
+            # The negative byte index worked, so no head requests.
+            assert not handler.head_request_paths
+            # The negative range request was in bounds and pulled in the entire
+            # .dist-info directory (at the end of the zip) for translationstring==0.1,
+            # so we didn't need another range request for it. compilewheel==2.0 has the
+            # .dist-info dir at the start of the zip, so we still need another request
+            # for that.
+            assert {
+                colander_wheel_path,
+                has_script_path,
+                compile_wheel_path,
+            } == handler.positive_range_request_paths
+            assert {
+                colander_wheel_path,
+                compile_wheel_path,
+                has_script_path,
+                translationstring_path,
+            } == handler.negative_range_request_paths
+            assert handler.get_request_counts[colander_wheel_path] == 3
+            assert handler.ok_response_counts[colander_wheel_path] == 1
+            # One more than translationstring, because the .dist-info dir is at the
+            # front of the wheel.
+            assert handler.get_request_counts[compile_wheel_path] == 3
+            assert handler.ok_response_counts[compile_wheel_path] == 1
+            # One more than NoNegativeRange, because the negative byte index failed.
+            assert handler.get_request_counts[has_script_path] == 3
+            assert handler.ok_response_counts[has_script_path] == 1
+            assert handler.get_request_counts[translationstring_path] == 2
+            assert handler.ok_response_counts[translationstring_path] == 1
+        else:
+            assert range_handler == RangeHandler.NegativeRangeOverflowing
+            # The negative byte index worked, so no head requests.
+            assert not handler.head_request_paths
+            # The negative range request was in bounds and pulled in the entire
+            # .dist-info directory (at the end of the zip) for translationstring==0.1,
+            # so we didn't need another range request for it. compilewheel==2.0 has the
+            # .dist-info dir at the start of the zip, so we still need another request
+            # for that.
+            assert {
+                colander_wheel_path,
+                compile_wheel_path,
+            } == handler.positive_range_request_paths
+            assert {
+                colander_wheel_path,
+                compile_wheel_path,
+                has_script_path,
+                translationstring_path,
+            } == handler.negative_range_request_paths
+            assert handler.get_request_counts[colander_wheel_path] == 3
+            assert handler.ok_response_counts[colander_wheel_path] == 1
+            # One more than translationstring, because the .dist-info dir is at the
+            # front of the wheel.
+            assert handler.get_request_counts[compile_wheel_path] == 3
+            assert handler.ok_response_counts[compile_wheel_path] == 1
+            # One *less* request for has-script than SupportsNegativeRange, because the
+            # server returned a full 200 OK response when the negative byte range was
+            # larger than the actual file size.
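+            # has-script therefore needs no follow-up positive range request at
+            # all: the overflowing 200 body already contained the whole wheel.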
+ assert handler.get_request_counts[has_script_path] == 2 + assert handler.ok_response_counts[has_script_path] == 2 + assert handler.get_request_counts[translationstring_path] == 2 + assert handler.ok_response_counts[translationstring_path] == 1 diff --git a/tests/lib/__init__.py b/tests/lib/__init__.py index e318f7155d2..df980cc01f1 100644 --- a/tests/lib/__init__.py +++ b/tests/lib/__init__.py @@ -1178,6 +1178,7 @@ def create_basic_wheel_for_package( extras: Optional[Dict[str, List[str]]] = None, requires_python: Optional[str] = None, extra_files: Optional[Dict[str, Union[bytes, str]]] = None, + metadata_first: bool = True, ) -> pathlib.Path: if depends is None: depends = [] @@ -1224,6 +1225,7 @@ def hello(): extra_files=extra_files, # Have an empty RECORD because we don't want to be checking hashes. record="", + metadata_first=metadata_first, ) wheel_builder.save_to(archive_path) diff --git a/tests/lib/wheel.py b/tests/lib/wheel.py index 342abbacaad..41098c60cb8 100644 --- a/tests/lib/wheel.py +++ b/tests/lib/wheel.py @@ -199,22 +199,19 @@ def record_file_maker_wrapper( version: str, files: Iterable[File], record: Defaulted[Optional[AnyStr]], -) -> Iterable[File]: - records: List[Record] = [] - for file in files: - records.append( - Record(file.name, digest(file.contents), str(len(file.contents))) - ) - yield file +) -> Optional[File]: + records: List[Record] = [ + Record(file.name, digest(file.contents), str(len(file.contents))) + for file in files + ] if record is None: - return + return None record_path = dist_info_path(name, version, "RECORD") if record is not _default: - yield File(record_path, ensure_binary(record)) - return + return File(record_path, ensure_binary(record)) records.append(Record(record_path, "", "")) @@ -224,7 +221,7 @@ def record_file_maker_wrapper( writer.writerow(r) contents = buf.getvalue().encode("utf-8") - yield File(record_path, contents) + return File(record_path, contents) def wheel_name( @@ -302,6 +299,7 @@ def make_wheel( console_scripts: Defaulted[List[str]] = _default, entry_points: Defaulted[Dict[str, List[str]]] = _default, record: Defaulted[Optional[AnyStr]] = _default, + metadata_first: bool = True, ) -> WheelBuilder: """ Helper function for generating test wheels which are compliant by default. @@ -362,13 +360,14 @@ def make_wheel( :param entry_points: :param record: if provided and None, then no RECORD file is generated; else if a string then sets the content of the RECORD file + :param metadata_first: Put the .dist-info metadata at the front of the zip file. 
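+        Real wheels typically place ``*.dist-info/`` at the end of the archive;
+        making the layout configurable lets these tests exercise both cases.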
""" pythons = ["py2", "py3"] abis = ["none"] platforms = ["any"] tags = list(itertools.product(pythons, abis, platforms)) - possible_files = [ + metadata_files = [ make_metadata_file(name, version, metadata, metadata_updates, metadata_body), make_wheel_metadata_file( name, version, wheel_metadata, tags, wheel_metadata_updates @@ -376,20 +375,31 @@ def make_wheel( make_entry_points_file(name, version, entry_points, console_scripts), ] + non_metadata_files = [] + if extra_files is not _default: - possible_files.extend(make_files(extra_files)) + non_metadata_files.extend(make_files(extra_files)) if extra_metadata_files is not _default: - possible_files.extend(make_metadata_files(name, version, extra_metadata_files)) + metadata_files.extend(make_metadata_files(name, version, extra_metadata_files)) if extra_data_files is not _default: - possible_files.extend(make_data_files(name, version, extra_data_files)) + non_metadata_files.extend(make_data_files(name, version, extra_data_files)) - actual_files = filter(None, possible_files) + actual_metadata_files = list(filter(None, metadata_files)) + + if metadata_first: + actual_files = actual_metadata_files + non_metadata_files + else: + actual_files = non_metadata_files + actual_metadata_files + + record_file = record_file_maker_wrapper(name, version, actual_files, record) + if record_file: + if metadata_first: + actual_files.insert(0, record_file) + else: + actual_files.append(record_file) - files_and_record_file = record_file_maker_wrapper( - name, version, actual_files, record - ) wheel_file_name = wheel_name(name, version, pythons, abis, platforms) - return WheelBuilder(wheel_file_name, files_and_record_file) + return WheelBuilder(wheel_file_name, actual_files)