From d237759b9e568881429b76b898345e67c4aad04a Mon Sep 17 00:00:00 2001 From: Daniel Holth Date: Mon, 3 Oct 2022 09:31:23 -0400 Subject: [PATCH] lazier lazy_wheel - handle short files `416`; prefetch entire dist-info - lazy_wheel: translate BadZipfile to InvalidWheel - handle 501 on negative bytes range from pypi - catch UnsupportedWheel - make lazy wheel work against tensorflow-gpu - link to pypi discussion on negative byte ranges - check for case when server doesn't support byte ranges at all - remove _check_zip() method since we do that in prefetch_dist_info() now - clean up error handling code and don't warn when negative ranges fail - remove TODO, as Cache-Control: no-cache is the correct behavior - rearrange explanatory comments --- news/12208.feature.rst | 2 + src/pip/_internal/network/lazy_wheel.py | 374 +++++++++++++++++++----- src/pip/_internal/operations/prepare.py | 25 +- src/pip/_internal/utils/wheel.py | 4 + 4 files changed, 323 insertions(+), 82 deletions(-) create mode 100644 news/12208.feature.rst diff --git a/news/12208.feature.rst b/news/12208.feature.rst new file mode 100644 index 00000000000..a4c298505a5 --- /dev/null +++ b/news/12208.feature.rst @@ -0,0 +1,2 @@ +Implement lazier lazy_wheel that avoids the HEAD request, fetching the metadata +for many wheels in 1 request. 
diff --git a/src/pip/_internal/network/lazy_wheel.py b/src/pip/_internal/network/lazy_wheel.py index 82ec50d5106..267069bde0b 100644 --- a/src/pip/_internal/network/lazy_wheel.py +++ b/src/pip/_internal/network/lazy_wheel.py @@ -1,26 +1,37 @@ """Lazy ZIP over HTTP""" -__all__ = ["HTTPRangeRequestUnsupported", "dist_from_wheel_url"] +from __future__ import annotations +__all__ = ["HTTPRangeRequestUnsupported", "dist_from_wheel_url", "LazyHTTPFile"] + +import io +import logging +import re from bisect import bisect_left, bisect_right from contextlib import contextmanager from tempfile import NamedTemporaryFile -from typing import Any, Dict, Generator, List, Optional, Tuple +from typing import Any, BinaryIO, ClassVar, Iterable, Iterator, cast +from urllib.parse import urlparse from zipfile import BadZipFile, ZipFile from pip._vendor.packaging.utils import canonicalize_name -from pip._vendor.requests.models import CONTENT_CHUNK_SIZE, Response +from pip._vendor.requests.models import CONTENT_CHUNK_SIZE, HTTPError, Response +from pip._vendor.requests.status_codes import codes +from pip._internal.exceptions import InvalidWheel, UnsupportedWheel from pip._internal.metadata import BaseDistribution, MemoryWheel, get_wheel_distribution -from pip._internal.network.session import PipSession -from pip._internal.network.utils import HEADERS, raise_for_status, response_chunks +from pip._internal.network.session import PipSession as Session +from pip._internal.network.utils import HEADERS +from pip._internal.utils.logging import indent_log + +logger = logging.getLogger(__name__) class HTTPRangeRequestUnsupported(Exception): pass -def dist_from_wheel_url(name: str, url: str, session: PipSession) -> BaseDistribution: +def dist_from_wheel_url(name: str, url: str, session: Session) -> BaseDistribution: """Return a distribution object from the given wheel URL. 
This uses HTTP range requests to only fetch the portion of the wheel @@ -28,39 +39,27 @@ def dist_from_wheel_url(name: str, url: str, session: PipSession) -> BaseDistrib If such requests are not supported, HTTPRangeRequestUnsupported is raised. """ - with LazyZipOverHTTP(url, session) as zf: - # For read-only ZIP files, ZipFile only needs methods read, - # seek, seekable and tell, not the whole IO protocol. - wheel = MemoryWheel(zf.name, zf) # type: ignore - # After context manager exit, wheel.name - # is an invalid file by intention. - return get_wheel_distribution(wheel, canonicalize_name(name)) + try: + # After context manager exit, wheel.name will point to a deleted file path. + # Add `delete_backing_file=False` to disable this for debugging. + with LazyHTTPFile(url, session) as lazy_file: + lazy_file.prefetch_contiguous_dist_info(name) + wheel = MemoryWheel(lazy_file.name, lazy_file) + return get_wheel_distribution(wheel, canonicalize_name(name)) + except (BadZipFile, UnsupportedWheel): + raise InvalidWheel(url, name) -class LazyZipOverHTTP: - """File-like object mapped to a ZIP file over HTTP. - This uses HTTP range requests to lazily fetch the file's content, - which is supposed to be fed to ZipFile. If such requests are not - supported by the server, raise HTTPRangeRequestUnsupported - during initialization. +class ReadOnlyIOWrapper(BinaryIO): + """Implement read-side ``BinaryIO`` methods wrapping an inner ``BinaryIO``. + + For read-only ZIP files, ``ZipFile`` only needs read, seek, seekable, and tell. + ``LazyHTTPFile`` subclasses this and therefore must only implement its lazy read(). 
""" - def __init__( - self, url: str, session: PipSession, chunk_size: int = CONTENT_CHUNK_SIZE - ) -> None: - head = session.head(url, headers=HEADERS) - raise_for_status(head) - assert head.status_code == 200 - self._session, self._url, self._chunk_size = session, url, chunk_size - self._length = int(head.headers["Content-Length"]) - self._file = NamedTemporaryFile() - self.truncate(self._length) - self._left: List[int] = [] - self._right: List[int] = [] - if "bytes" not in head.headers.get("Accept-Ranges", "none"): - raise HTTPRangeRequestUnsupported("range request is not supported") - self._check_zip() + def __init__(self, inner: BinaryIO) -> None: + self._file = inner @property def mode(self) -> str: @@ -85,24 +84,25 @@ def closed(self) -> bool: """Whether the file is closed.""" return self._file.closed - def read(self, size: int = -1) -> bytes: - """Read up to size bytes from the object and return them. + def fileno(self) -> int: + return self._file.fileno() - As a convenience, if size is unspecified or -1, - all bytes until EOF are returned. Fewer than - size bytes may be returned if EOF is reached. - """ - download_size = max(size, self._chunk_size) - start, length = self.tell(), self._length - stop = length if size < 0 else min(start + download_size, length) - start = max(0, stop - download_size) - self._download(start, stop - 1) - return self._file.read(size) + def flush(self) -> None: + self._file.flush() + + def isatty(self) -> bool: + return False def readable(self) -> bool: """Return whether the file is readable, which is True.""" return True + def readline(self, limit: int = -1) -> bytes: + raise NotImplementedError + + def readlines(self, hint: int = -1) -> list[bytes]: + raise NotImplementedError + def seek(self, offset: int, whence: int = 0) -> int: """Change stream position and return the new absolute position. 
@@ -117,7 +117,7 @@ def tell(self) -> int: """Return the current position.""" return self._file.tell() - def truncate(self, size: Optional[int] = None) -> int: + def truncate(self, size: int | None = None) -> int: """Resize the stream to the given size in bytes. If size is unspecified resize to the current position. @@ -131,15 +131,206 @@ def writable(self) -> bool: """Return False.""" return False - def __enter__(self) -> "LazyZipOverHTTP": + def write(self, s: bytes) -> int: + raise NotImplementedError + + def writelines(self, lines: Iterable[bytes]) -> None: + raise NotImplementedError + + def __enter__(self) -> ReadOnlyIOWrapper: self._file.__enter__() return self def __exit__(self, *exc: Any) -> None: self._file.__exit__(*exc) + def __iter__(self) -> Iterator[bytes]: + raise NotImplementedError + + def __next__(self) -> bytes: + raise NotImplementedError + + +# The central directory for tensorflow_gpu-2.5.3-cp38-cp38-manylinux2010_x86_64.whl is +# 944931 bytes, for a 459424488 byte file (about 486x as large). +_DEFAULT_INITIAL_FETCH = 1_000_000 + + +# The requests we perform in this file are intentionally small, and we don't want to +# guess at how the server will interpret caching directives for range requests (we +# especially don't want the server to return 200 OK with the entire file contents). +# TODO: consider If-Match (etag) and handling 304 File Modified, here or in the +# unpack_url() method for non-lazy downloads. +_UNCACHED_HEADERS = {**HEADERS, "Cache-Control": "no-cache"} + + +class LazyHTTPFile(ReadOnlyIOWrapper): + """File-like object mapped to a ZIP file over HTTP. + + This uses HTTP range requests to lazily fetch the file's content, + which is supposed to be fed to ZipFile. If such requests are not + supported by the server, raise HTTPRangeRequestUnsupported + during initialization. 
+ """ + + # Cache this on the type to avoid trying and failing our initial lazy wheel request + # multiple times in the same pip invocation against an index without this support. + _domains_without_negative_range: ClassVar[set[str]] = set() + + def __init__( + self, + url: str, + session: Session, + initial_chunk_size: int = _DEFAULT_INITIAL_FETCH, + delete_backing_file: bool = True, + ) -> None: + super().__init__(cast(BinaryIO, NamedTemporaryFile(delete=delete_backing_file))) + + self._request_count = 0 + self._session = session + self._url = url + self._left: list[int] = [] + self._right: list[int] = [] + + self._length, initial_chunk = self._extract_content_length(initial_chunk_size) + self.truncate(self._length) + if initial_chunk is None: + # If we could not download any file contents yet (e.g. if negative byte + # ranges were not supported), then download all of this at once, hopefully + # pulling in the entire central directory. + initial_start = max(0, self._length - initial_chunk_size) + self._download(initial_start, self._length) + else: + # If we *could* download some file contents, then write them to the end of + # the file and set up our bisect boundaries by hand. + with self._stay(): + self.seek(-len(initial_chunk), io.SEEK_END) + self._file.write(initial_chunk) + self._left.append(self._length - len(initial_chunk)) + self._right.append(self._length - 1) + + def read(self, size: int = -1) -> bytes: + """Read up to size bytes from the object and return them. + + As a convenience, if size is unspecified or -1, + all bytes until EOF are returned. Fewer than + size bytes may be returned if EOF is reached. 
+ """ + cur = self.tell() + logger.debug("read size %d at %d", size, cur) + if size < 0: + assert cur <= self._length + download_size = self._length - cur + elif size == 0: + return b"" + else: + download_size = size + stop = min(cur + download_size, self._length) + self._download(cur, stop) + return self._file.read(size) + + def __enter__(self) -> LazyHTTPFile: + super().__enter__() + return self + + def __exit__(self, *exc: Any) -> None: + logger.debug("%d requests for url %s", self._request_count, self._url) + super().__exit__(*exc) + + def _content_length_from_head(self) -> int: + self._request_count += 1 + head = self._session.head(self._url, headers=_UNCACHED_HEADERS) + head.raise_for_status() + assert head.status_code == codes.ok + accepted_range = head.headers.get("Accept-Ranges", None) + if accepted_range != "bytes": + raise HTTPRangeRequestUnsupported( + f"server does not support byte ranges: header was '{accepted_range}'" + ) + return int(head.headers["Content-Length"]) + + @staticmethod + def _parse_full_length_from_content_range(arg: str) -> int: + # https://www.rfc-editor.org/rfc/rfc9110#field.content-range + m = re.match(r"bytes [^/]+/([0-9]+)", arg) + if m is None: + raise HTTPRangeRequestUnsupported(f"could not parse Content-Range: '{arg}'") + return int(m.group(1)) + + def _try_initial_chunk_request(self, initial_chunk_size: int) -> tuple[int, bytes]: + headers = _UNCACHED_HEADERS.copy() + # Perform a negative range index, which is not supported by some servers. + headers["Range"] = f"bytes=-{initial_chunk_size}" + logger.debug("initial bytes request: %s", headers["Range"]) + + self._request_count += 1 + tail = self._session.get(self._url, headers=headers) + tail.raise_for_status() + + response_length = int(tail.headers["Content-Length"]) + assert response_length == len(tail.content) + + code = tail.status_code + # We've just received the entire file contents instead of any range at all. 
+ if code == codes.ok: + # If this was done despite a smaller requested byte range, then we assume + # the server does not support range requests. + if len(tail.content) > initial_chunk_size: + raise HTTPRangeRequestUnsupported( + "returned complete file contents instead of range" + ) + elif code != codes.partial_content: + raise HTTPRangeRequestUnsupported( + f"did not receive partial content or ok: got code {code}" + ) + + file_length = self._parse_full_length_from_content_range( + tail.headers["Content-Range"] + ) + return (file_length, tail.content) + + def _extract_content_length( + self, initial_chunk_size: int + ) -> tuple[int, bytes | None]: + domain = urlparse(self._url).netloc + if domain in self._domains_without_negative_range: + return (self._content_length_from_head(), None) + + # Initial range request for just the end of the file. + try: + return self._try_initial_chunk_request(initial_chunk_size) + except HTTPError as e: + resp = e.response + code = resp.status_code + # Our initial request using a negative byte range was not supported. + if code == codes.not_implemented: + # pypi notably does not support negative byte ranges: see + # https://github.com/pypi/warehouse/issues/12823. + logger.debug( + "Negative byte range not supported for domain '%s': " + "using HEAD request before lazy wheel from now on", + domain, + ) + # Avoid trying a negative byte range request against this domain for the + # rest of the resolve. + self._domains_without_negative_range.add(domain) + # Apply a HEAD request to get the real size, and nothing else for now. + return (self._content_length_from_head(), None) + # This indicates that the requested range from the end was larger than the + # actual file size: https://www.rfc-editor.org/rfc/rfc9110#status.416. + if code == codes.requested_range_not_satisfiable: + # In this case, we don't have any file content yet, but we do know the + # size the file will be, so we can return that and exit here.
+ file_length = self._parse_full_length_from_content_range( + resp.headers["Content-Range"] + ) + return (file_length, None) + # If we get some other error, then we expect that non-range requests will + # also fail, so we error out here and let the user figure it out. + raise + @contextmanager - def _stay(self) -> Generator[None, None, None]: + def _stay(self) -> Iterator[None]: """Return a context manager keeping the position. At the end of the block, seek back to original position. @@ -150,35 +341,20 @@ def _stay(self) -> Generator[None, None, None]: finally: self.seek(pos) - def _check_zip(self) -> None: - """Check and download until the file is a valid ZIP.""" - end = self._length - 1 - for start in reversed(range(0, end, self._chunk_size)): - self._download(start, end) - with self._stay(): - try: - # For read-only ZIP files, ZipFile only needs - # methods read, seek, seekable and tell. - ZipFile(self) # type: ignore - except BadZipFile: - pass - else: - break - - def _stream_response( - self, start: int, end: int, base_headers: Dict[str, str] = HEADERS - ) -> Response: - """Return HTTP response to a range request from start to end.""" - headers = base_headers.copy() + def _stream_response(self, start: int, end: int) -> Response: + """Return streaming HTTP response to a range request from start to end.""" + headers = _UNCACHED_HEADERS.copy() headers["Range"] = f"bytes={start}-{end}" - # TODO: Get range requests to be correctly cached - headers["Cache-Control"] = "no-cache" - return self._session.get(self._url, headers=headers, stream=True) + logger.debug("streamed bytes request: %s", headers["Range"]) + self._request_count += 1 + response = self._session.get(self._url, headers=headers, stream=True) + response.raise_for_status() + return response def _merge( self, start: int, end: int, left: int, right: int - ) -> Generator[Tuple[int, int], None, None]: - """Return a generator of intervals to be fetched. 
+ ) -> Iterator[tuple[int, int]]: + """Return an iterator of intervals to be fetched. Args: start (int): Start of needed interval @@ -199,12 +375,58 @@ def _merge( def _download(self, start: int, end: int) -> None: """Download bytes from start to end inclusively.""" + # Reducing by 1 to get an inclusive end range. + end -= 1 with self._stay(): left = bisect_left(self._right, start) right = bisect_right(self._left, end) for start, end in self._merge(start, end, left, right): response = self._stream_response(start, end) - response.raise_for_status() self.seek(start) - for chunk in response_chunks(response, self._chunk_size): + for chunk in response.iter_content(CONTENT_CHUNK_SIZE): self._file.write(chunk) + + def prefetch_contiguous_dist_info(self, name: str) -> None: + """Read contents of entire dist-info section of wheel. + + We know pip will read every entry in this directory when generating a dist from + a wheel, so prepopulating the file contents avoids waiting for further + range requests. + """ + # Clarify in debug output which requests were sent during __init__, which during + # the prefetch, and which during the dist metadata generation. + with indent_log(): + logger.debug("begin prefetching dist-info for %s", name) + self._prefetch_contiguous_dist_info(name) + logger.debug("done prefetching dist-info for %s", name) + + def _prefetch_contiguous_dist_info(self, name: str) -> None: + dist_info_prefix = re.compile(r"^[^/]*\.dist-info/") + start: int | None = None + end: int | None = None + + # This may perform further requests if __init__() did not pull in the entire + # central directory at the end of the file (although _DEFAULT_INITIAL_FETCH + # should be set large enough to avoid this). 
+ zf = ZipFile(self) + + for info in zf.infolist(): + if start is None: + if dist_info_prefix.search(info.filename): + start = info.header_offset + continue + else: + # The last .dist-info/ entry may be before the end of the file if the + # wheel's entries are sorted lexicographically (which is unusual). + if not dist_info_prefix.search(info.filename): + end = info.header_offset + break + if start is None: + raise UnsupportedWheel( + f"no {dist_info_prefix!r} directory found for {name} in {self.name}" + ) + # If the last entries of the zip are the .dist-info/ dir (as usual), then give + # us everything until the start of the central directory. + if end is None: + end = zf.start_dir + self._download(start, end) diff --git a/src/pip/_internal/operations/prepare.py b/src/pip/_internal/operations/prepare.py index a21ee2b8752..ac8a374560c 100644 --- a/src/pip/_internal/operations/prepare.py +++ b/src/pip/_internal/operations/prepare.py @@ -7,7 +7,8 @@ import mimetypes import os import shutil -from typing import Dict, Iterable, List, Optional, Set +import urllib.parse +from typing import Dict, Iterable, List, Optional from pip._vendor.packaging.utils import canonicalize_name @@ -269,6 +270,9 @@ def __init__( # Previous "header" printed for a link-based InstallRequirement self._previous_requirement_header = ("", "") + # Whether an HTTPRangeRequestUnsupported was already thrown for this domain. 
+ self._domains_without_http_range: set[str] = set() + def _log_preparing_link(self, req: InstallRequirement) -> None: """Provide context for the requirement being prepared.""" if req.link.is_file and not req.is_wheel_from_cache: @@ -440,6 +444,10 @@ def _fetch_metadata_using_lazy_wheel( ) return None + domain = urllib.parse.urlparse(link.url).netloc + if domain in self._domains_without_http_range: + return None + wheel = Wheel(link.filename) name = canonicalize_name(wheel.name) logger.info( @@ -447,11 +455,16 @@ def _fetch_metadata_using_lazy_wheel( name, wheel.version, ) - url = link.url.split("#", 1)[0] try: - return dist_from_wheel_url(name, url, self._session) - except HTTPRangeRequestUnsupported: - logger.debug("%s does not support range requests", url) + return dist_from_wheel_url(name, link.url_without_fragment, self._session) + except HTTPRangeRequestUnsupported as e: + logger.debug( + "domain '%s' does not support range requests (%s): " + "not attempting lazy wheel further", + domain, + str(e), + ) + self._domains_without_http_range.add(domain) return None def _complete_partial_requirements( @@ -472,7 +485,7 @@ def _complete_partial_requirements( assert req.link links_to_fully_download[req.link] = req - reqs_with_newly_unpacked_source_dirs: Set[Link] = set() + reqs_with_newly_unpacked_source_dirs: set[Link] = set() batch_download = self._batch_download( links_to_fully_download.keys(), diff --git a/src/pip/_internal/utils/wheel.py b/src/pip/_internal/utils/wheel.py index e5e3f34ed81..15b02ea3e46 100644 --- a/src/pip/_internal/utils/wheel.py +++ b/src/pip/_internal/utils/wheel.py @@ -10,6 +10,7 @@ from pip._vendor.packaging.utils import canonicalize_name from pip._internal.exceptions import UnsupportedWheel +from pip._internal.network.lazy_wheel import LazyHTTPFile VERSION_COMPATIBLE = (1, 0) @@ -69,6 +70,9 @@ def wheel_dist_info_dir(source: ZipFile, name: str) -> str: def read_wheel_metadata_file(source: ZipFile, path: str) -> bytes: + if 
isinstance(source.fp, LazyHTTPFile): + logger.debug("extracting entry '%s' from lazy zip '%s'", path, source.fp.name) + try: return source.read(path) # BadZipFile for general corruption, KeyError for missing entry,