From d237759b9e568881429b76b898345e67c4aad04a Mon Sep 17 00:00:00 2001 From: Daniel Holth Date: Mon, 3 Oct 2022 09:31:23 -0400 Subject: [PATCH] lazier lazy_wheel - handle short files `416`; prefetch entire dist-info - lazy_wheel: translate BadZipfile to InvalidWheel - handle 501 on negative bytes range from pypi - catch UnsupportedWheel - make lazy wheel work against tensorflow-gpu - link to pypi discussion on negative byte ranges - check for case when server doesn't support byte ranges at all - remove _check_zip() method since we do that in prefetch_dist_info() now - clean up error handling code and don't warn when negative ranges fail - remove TODO, as Cache-Control: no-cache is the correct behavior - rearrange explanatory comments --- news/12208.feature.rst | 2 + src/pip/_internal/network/lazy_wheel.py | 374 +++++++++++++++++++----- src/pip/_internal/operations/prepare.py | 25 +- src/pip/_internal/utils/wheel.py | 4 + 4 files changed, 323 insertions(+), 82 deletions(-) create mode 100644 news/12208.feature.rst diff --git a/news/12208.feature.rst b/news/12208.feature.rst new file mode 100644 index 00000000000..a4c298505a5 --- /dev/null +++ b/news/12208.feature.rst @@ -0,0 +1,2 @@ +Implement lazier lazy_wheel that avoids the HEAD request, fetching the metadata +for many wheels in 1 request. 
diff --git a/src/pip/_internal/network/lazy_wheel.py b/src/pip/_internal/network/lazy_wheel.py index 82ec50d5106..267069bde0b 100644 --- a/src/pip/_internal/network/lazy_wheel.py +++ b/src/pip/_internal/network/lazy_wheel.py @@ -1,26 +1,37 @@ """Lazy ZIP over HTTP""" -__all__ = ["HTTPRangeRequestUnsupported", "dist_from_wheel_url"] +from __future__ import annotations +__all__ = ["HTTPRangeRequestUnsupported", "dist_from_wheel_url", "LazyHTTPFile"] + +import io +import logging +import re from bisect import bisect_left, bisect_right from contextlib import contextmanager from tempfile import NamedTemporaryFile -from typing import Any, Dict, Generator, List, Optional, Tuple +from typing import Any, BinaryIO, ClassVar, Iterable, Iterator, cast +from urllib.parse import urlparse from zipfile import BadZipFile, ZipFile from pip._vendor.packaging.utils import canonicalize_name -from pip._vendor.requests.models import CONTENT_CHUNK_SIZE, Response +from pip._vendor.requests.models import CONTENT_CHUNK_SIZE, HTTPError, Response +from pip._vendor.requests.status_codes import codes +from pip._internal.exceptions import InvalidWheel, UnsupportedWheel from pip._internal.metadata import BaseDistribution, MemoryWheel, get_wheel_distribution -from pip._internal.network.session import PipSession -from pip._internal.network.utils import HEADERS, raise_for_status, response_chunks +from pip._internal.network.session import PipSession as Session +from pip._internal.network.utils import HEADERS +from pip._internal.utils.logging import indent_log + +logger = logging.getLogger(__name__) class HTTPRangeRequestUnsupported(Exception): pass -def dist_from_wheel_url(name: str, url: str, session: PipSession) -> BaseDistribution: +def dist_from_wheel_url(name: str, url: str, session: Session) -> BaseDistribution: """Return a distribution object from the given wheel URL. 
This uses HTTP range requests to only fetch the portion of the wheel @@ -28,39 +39,27 @@ def dist_from_wheel_url(name: str, url: str, session: PipSession) -> BaseDistrib If such requests are not supported, HTTPRangeRequestUnsupported is raised. """ - with LazyZipOverHTTP(url, session) as zf: - # For read-only ZIP files, ZipFile only needs methods read, - # seek, seekable and tell, not the whole IO protocol. - wheel = MemoryWheel(zf.name, zf) # type: ignore - # After context manager exit, wheel.name - # is an invalid file by intention. - return get_wheel_distribution(wheel, canonicalize_name(name)) + try: + # After context manager exit, wheel.name will point to a deleted file path. + # Add `delete_backing_file=False` to disable this for debugging. + with LazyHTTPFile(url, session) as lazy_file: + lazy_file.prefetch_contiguous_dist_info(name) + wheel = MemoryWheel(lazy_file.name, lazy_file) + return get_wheel_distribution(wheel, canonicalize_name(name)) + except (BadZipFile, UnsupportedWheel): + raise InvalidWheel(url, name) -class LazyZipOverHTTP: - """File-like object mapped to a ZIP file over HTTP. - This uses HTTP range requests to lazily fetch the file's content, - which is supposed to be fed to ZipFile. If such requests are not - supported by the server, raise HTTPRangeRequestUnsupported - during initialization. +class ReadOnlyIOWrapper(BinaryIO): + """Implement read-side ``BinaryIO`` methods wrapping an inner ``BinaryIO``. + + For read-only ZIP files, ``ZipFile`` only needs read, seek, seekable, and tell. + ``LazyHTTPFile`` subclasses this and therefore must only implement its lazy read(). 
""" - def __init__( - self, url: str, session: PipSession, chunk_size: int = CONTENT_CHUNK_SIZE - ) -> None: - head = session.head(url, headers=HEADERS) - raise_for_status(head) - assert head.status_code == 200 - self._session, self._url, self._chunk_size = session, url, chunk_size - self._length = int(head.headers["Content-Length"]) - self._file = NamedTemporaryFile() - self.truncate(self._length) - self._left: List[int] = [] - self._right: List[int] = [] - if "bytes" not in head.headers.get("Accept-Ranges", "none"): - raise HTTPRangeRequestUnsupported("range request is not supported") - self._check_zip() + def __init__(self, inner: BinaryIO) -> None: + self._file = inner @property def mode(self) -> str: @@ -85,24 +84,25 @@ def closed(self) -> bool: """Whether the file is closed.""" return self._file.closed - def read(self, size: int = -1) -> bytes: - """Read up to size bytes from the object and return them. + def fileno(self) -> int: + return self._file.fileno() - As a convenience, if size is unspecified or -1, - all bytes until EOF are returned. Fewer than - size bytes may be returned if EOF is reached. - """ - download_size = max(size, self._chunk_size) - start, length = self.tell(), self._length - stop = length if size < 0 else min(start + download_size, length) - start = max(0, stop - download_size) - self._download(start, stop - 1) - return self._file.read(size) + def flush(self) -> None: + self._file.flush() + + def isatty(self) -> bool: + return False def readable(self) -> bool: """Return whether the file is readable, which is True.""" return True + def readline(self, limit: int = -1) -> bytes: + raise NotImplementedError + + def readlines(self, hint: int = -1) -> list[bytes]: + raise NotImplementedError + def seek(self, offset: int, whence: int = 0) -> int: """Change stream position and return the new absolute position. 
@@ -117,7 +117,7 @@ def tell(self) -> int: """Return the current position.""" return self._file.tell() - def truncate(self, size: Optional[int] = None) -> int: + def truncate(self, size: int | None = None) -> int: """Resize the stream to the given size in bytes. If size is unspecified resize to the current position. @@ -131,15 +131,206 @@ def writable(self) -> bool: """Return False.""" return False - def __enter__(self) -> "LazyZipOverHTTP": + def write(self, s: bytes) -> int: + raise NotImplementedError + + def writelines(self, lines: Iterable[bytes]) -> None: + raise NotImplementedError + + def __enter__(self) -> ReadOnlyIOWrapper: self._file.__enter__() return self def __exit__(self, *exc: Any) -> None: self._file.__exit__(*exc) + def __iter__(self) -> Iterator[bytes]: + raise NotImplementedError + + def __next__(self) -> bytes: + raise NotImplementedError + + +# The central directory for tensorflow_gpu-2.5.3-cp38-cp38-manylinux2010_x86_64.whl is +# 944931 bytes, for a 459424488 byte file (about 486x as large). +_DEFAULT_INITIAL_FETCH = 1_000_000 + + +# The requests we perform in this file are intentionally small, and we don't want to +# guess at how the server will interpret caching directives for range requests (we +# especially don't want the server to return 200 OK with the entire file contents). +# TODO: consider If-Match (etag) and handling 304 File Modified, here or in the +# unpack_url() method for non-lazy downloads. +_UNCACHED_HEADERS = {**HEADERS, "Cache-Control": "no-cache"} + + +class LazyHTTPFile(ReadOnlyIOWrapper): + """File-like object mapped to a ZIP file over HTTP. + + This uses HTTP range requests to lazily fetch the file's content, + which is supposed to be fed to ZipFile. If such requests are not + supported by the server, raise HTTPRangeRequestUnsupported + during initialization. 
+ """ + + # Cache this on the type to avoid trying and failing our initial lazy wheel request + # multiple times in the same pip invocation against an index without this support. + _domains_without_negative_range: ClassVar[set[str]] = set() + + def __init__( + self, + url: str, + session: Session, + initial_chunk_size: int = _DEFAULT_INITIAL_FETCH, + delete_backing_file: bool = True, + ) -> None: + super().__init__(cast(BinaryIO, NamedTemporaryFile(delete=delete_backing_file))) + + self._request_count = 0 + self._session = session + self._url = url + self._left: list[int] = [] + self._right: list[int] = [] + + self._length, initial_chunk = self._extract_content_length(initial_chunk_size) + self.truncate(self._length) + if initial_chunk is None: + # If we could not download any file contents yet (e.g. if negative byte + # ranges were not supported), then download all of this at once, hopefully + # pulling in the entire central directory. + initial_start = max(0, self._length - initial_chunk_size) + self._download(initial_start, self._length) + else: + # If we *could* download some file contents, then write them to the end of + # the file and set up our bisect boundaries by hand. + with self._stay(): + self.seek(-len(initial_chunk), io.SEEK_END) + self._file.write(initial_chunk) + self._left.append(self._length - len(initial_chunk)) + self._right.append(self._length - 1) + + def read(self, size: int = -1) -> bytes: + """Read up to size bytes from the object and return them. + + As a convenience, if size is unspecified or -1, + all bytes until EOF are returned. Fewer than + size bytes may be returned if EOF is reached. 
+ """ + cur = self.tell() + logger.debug("read size %d at %d", size, cur) + if size < 0: + assert cur <= self._length + download_size = self._length - cur + elif size == 0: + return b"" + else: + download_size = size + stop = min(cur + download_size, self._length) + self._download(cur, stop) + return self._file.read(size) + + def __enter__(self) -> LazyHTTPFile: + super().__enter__() + return self + + def __exit__(self, *exc: Any) -> None: + logger.debug("%d requests for url %s", self._request_count, self._url) + super().__exit__(*exc) + + def _content_length_from_head(self) -> int: + self._request_count += 1 + head = self._session.head(self._url, headers=_UNCACHED_HEADERS) + head.raise_for_status() + assert head.status_code == codes.ok + accepted_range = head.headers.get("Accept-Ranges", None) + if accepted_range != "bytes": + raise HTTPRangeRequestUnsupported( + f"server does not support byte ranges: header was '{accepted_range}'" + ) + return int(head.headers["Content-Length"]) + + @staticmethod + def _parse_full_length_from_content_range(arg: str) -> int: + # https://www.rfc-editor.org/rfc/rfc9110#field.content-range + m = re.match(r"bytes [^/]+/([0-9]+)", arg) + if m is None: + raise HTTPRangeRequestUnsupported(f"could not parse Content-Range: '{arg}'") + return int(m.group(1)) + + def _try_initial_chunk_request(self, initial_chunk_size: int) -> tuple[int, bytes]: + headers = _UNCACHED_HEADERS.copy() + # Perform a negative range index, which is not supported by some servers. + headers["Range"] = f"bytes=-{initial_chunk_size}" + logger.debug("initial bytes request: %s", headers["Range"]) + + self._request_count += 1 + tail = self._session.get(self._url, headers=headers) + tail.raise_for_status() + + response_length = int(tail.headers["Content-Length"]) + assert response_length == len(tail.content) + + code = tail.status_code + # We've just received the entire file contents instead of any range at all. 
+ if code == codes.ok: + # If this was done despite a smaller requested byte range, then we assume + # the server does not support range requests. + if len(tail.content) > initial_chunk_size: + raise HTTPRangeRequestUnsupported( + "returned complete file contents instead of range" + ) + elif code != codes.partial_content: + raise HTTPRangeRequestUnsupported( + f"did not receive partial content or ok: got code {code}" + ) + + file_length = self._parse_full_length_from_content_range( + tail.headers["Content-Range"] + ) + return (file_length, tail.content) + + def _extract_content_length( + self, initial_chunk_size: int + ) -> tuple[int, bytes | None]: + domain = urlparse(self._url).netloc + if domain in self._domains_without_negative_range: + return (self._content_length_from_head(), None) + + # Initial range request for just the end of the file. + try: + return self._try_initial_chunk_request(initial_chunk_size) + except HTTPError as e: + resp = e.response + code = resp.status_code + # Our initial request using a negative byte range was not supported. + if code == codes.not_implemented: + # pypi notably does not support negative byte ranges: see + # https://github.com/pypi/warehouse/issues/12823. + logger.debug( + "Negative byte range not supported for domain '%s': " + "using HEAD request before lazy wheel from now on", + domain, + ) + # Avoid trying a negative byte range request against this domain for the + # rest of the resolve. + self._domains_without_negative_range.add(domain) + # Apply a HEAD request to get the real size, and nothing else for now. + return (self._content_length_from_head(), None) + # This indicates that the requested range from the end was larger than the + # actual file size: https://www.rfc-editor.org/rfc/rfc9110#status.416. + if code == codes.requested_range_not_satisfiable: + # In this case, we don't have any file content yet, but we do know the + # size the file will be, so we can return that and exit here.
+ file_length = self._parse_full_length_from_content_range( + resp.headers["Content-Range"] + ) + return (file_length, None) + # If we get some other error, then we expect that non-range requests will + # also fail, so we error out here and let the user figure it out. + raise + @contextmanager - def _stay(self) -> Generator[None, None, None]: + def _stay(self) -> Iterator[None]: """Return a context manager keeping the position. At the end of the block, seek back to original position. @@ -150,35 +341,20 @@ def _stay(self) -> Generator[None, None, None]: finally: self.seek(pos) - def _check_zip(self) -> None: - """Check and download until the file is a valid ZIP.""" - end = self._length - 1 - for start in reversed(range(0, end, self._chunk_size)): - self._download(start, end) - with self._stay(): - try: - # For read-only ZIP files, ZipFile only needs - # methods read, seek, seekable and tell. - ZipFile(self) # type: ignore - except BadZipFile: - pass - else: - break - - def _stream_response( - self, start: int, end: int, base_headers: Dict[str, str] = HEADERS - ) -> Response: - """Return HTTP response to a range request from start to end.""" - headers = base_headers.copy() + def _stream_response(self, start: int, end: int) -> Response: + """Return streaming HTTP response to a range request from start to end.""" + headers = _UNCACHED_HEADERS.copy() headers["Range"] = f"bytes={start}-{end}" - # TODO: Get range requests to be correctly cached - headers["Cache-Control"] = "no-cache" - return self._session.get(self._url, headers=headers, stream=True) + logger.debug("streamed bytes request: %s", headers["Range"]) + self._request_count += 1 + response = self._session.get(self._url, headers=headers, stream=True) + response.raise_for_status() + return response def _merge( self, start: int, end: int, left: int, right: int - ) -> Generator[Tuple[int, int], None, None]: - """Return a generator of intervals to be fetched. 
+ ) -> Iterator[tuple[int, int]]: + """Return an iterator of intervals to be fetched. Args: start (int): Start of needed interval @@ -199,12 +375,58 @@ def _merge( def _download(self, start: int, end: int) -> None: """Download bytes from start to end inclusively.""" + # Reducing by 1 to get an inclusive end range. + end -= 1 with self._stay(): left = bisect_left(self._right, start) right = bisect_right(self._left, end) for start, end in self._merge(start, end, left, right): response = self._stream_response(start, end) - response.raise_for_status() self.seek(start) - for chunk in response_chunks(response, self._chunk_size): + for chunk in response.iter_content(CONTENT_CHUNK_SIZE): self._file.write(chunk) + + def prefetch_contiguous_dist_info(self, name: str) -> None: + """Read contents of entire dist-info section of wheel. + + We know pip will read every entry in this directory when generating a dist from + a wheel, so prepopulating the file contents avoids waiting for further + range requests. + """ + # Clarify in debug output which requests were sent during __init__, which during + # the prefetch, and which during the dist metadata generation. + with indent_log(): + logger.debug("begin prefetching dist-info for %s", name) + self._prefetch_contiguous_dist_info(name) + logger.debug("done prefetching dist-info for %s", name) + + def _prefetch_contiguous_dist_info(self, name: str) -> None: + dist_info_prefix = re.compile(r"^[^/]*\.dist-info/") + start: int | None = None + end: int | None = None + + # This may perform further requests if __init__() did not pull in the entire + # central directory at the end of the file (although _DEFAULT_INITIAL_FETCH + # should be set large enough to avoid this). 
+ zf = ZipFile(self) + + for info in zf.infolist(): + if start is None: + if dist_info_prefix.search(info.filename): + start = info.header_offset + continue + else: + # The last .dist-info/ entry may be before the end of the file if the + # wheel's entries are sorted lexicographically (which is unusual). + if not dist_info_prefix.search(info.filename): + end = info.header_offset + break + if start is None: + raise UnsupportedWheel( + f"no {dist_info_prefix!r} directory found for {name} in {self.name}" + ) + # If the last entries of the zip are the .dist-info/ dir (as usual), then give + # us everything until the start of the central directory. + if end is None: + end = zf.start_dir + self._download(start, end) diff --git a/src/pip/_internal/operations/prepare.py b/src/pip/_internal/operations/prepare.py index a21ee2b8752..ac8a374560c 100644 --- a/src/pip/_internal/operations/prepare.py +++ b/src/pip/_internal/operations/prepare.py @@ -7,7 +7,8 @@ import mimetypes import os import shutil -from typing import Dict, Iterable, List, Optional, Set +import urllib.parse +from typing import Dict, Iterable, List, Optional from pip._vendor.packaging.utils import canonicalize_name @@ -269,6 +270,9 @@ def __init__( # Previous "header" printed for a link-based InstallRequirement self._previous_requirement_header = ("", "") + # Whether an HTTPRangeRequestUnsupported was already thrown for this domain. 
+ self._domains_without_http_range: set[str] = set() + def _log_preparing_link(self, req: InstallRequirement) -> None: """Provide context for the requirement being prepared.""" if req.link.is_file and not req.is_wheel_from_cache: @@ -440,6 +444,10 @@ def _fetch_metadata_using_lazy_wheel( ) return None + domain = urllib.parse.urlparse(link.url).netloc + if domain in self._domains_without_http_range: + return None + wheel = Wheel(link.filename) name = canonicalize_name(wheel.name) logger.info( @@ -447,11 +455,16 @@ def _fetch_metadata_using_lazy_wheel( name, wheel.version, ) - url = link.url.split("#", 1)[0] try: - return dist_from_wheel_url(name, url, self._session) - except HTTPRangeRequestUnsupported: - logger.debug("%s does not support range requests", url) + return dist_from_wheel_url(name, link.url_without_fragment, self._session) + except HTTPRangeRequestUnsupported as e: + logger.debug( + "domain '%s' does not support range requests (%s): " + "not attempting lazy wheel further", + domain, + str(e), + ) + self._domains_without_http_range.add(domain) return None def _complete_partial_requirements( @@ -472,7 +485,7 @@ def _complete_partial_requirements( assert req.link links_to_fully_download[req.link] = req - reqs_with_newly_unpacked_source_dirs: Set[Link] = set() + reqs_with_newly_unpacked_source_dirs: set[Link] = set() batch_download = self._batch_download( links_to_fully_download.keys(), diff --git a/src/pip/_internal/utils/wheel.py b/src/pip/_internal/utils/wheel.py index e5e3f34ed81..15b02ea3e46 100644 --- a/src/pip/_internal/utils/wheel.py +++ b/src/pip/_internal/utils/wheel.py @@ -10,6 +10,7 @@ from pip._vendor.packaging.utils import canonicalize_name from pip._internal.exceptions import UnsupportedWheel +from pip._internal.network.lazy_wheel import LazyHTTPFile VERSION_COMPATIBLE = (1, 0) @@ -69,6 +70,9 @@ def wheel_dist_info_dir(source: ZipFile, name: str) -> str: def read_wheel_metadata_file(source: ZipFile, path: str) -> bytes: + if 
isinstance(source.fp, LazyHTTPFile): + logger.debug("extracting entry '%s' from lazy zip '%s'", path, source.fp.name) + try: return source.read(path) # BadZipFile for general corruption, KeyError for missing entry,