From 95cfe930556f2b8a47cd7a0414e0d98714138b03 Mon Sep 17 00:00:00 2001
From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com>
Date: Mon, 7 Aug 2023 01:43:52 -0400
Subject: [PATCH] rewrite some parts of lazy wheel

---
 src/pip/_internal/network/lazy_wheel.py | 71 +++++++++++-------------
 src/pip/_internal/utils/wheel.py        |  1 +
 2 files changed, 32 insertions(+), 40 deletions(-)

diff --git a/src/pip/_internal/network/lazy_wheel.py b/src/pip/_internal/network/lazy_wheel.py
index 08301d3d92c..cd451c9227a 100644
--- a/src/pip/_internal/network/lazy_wheel.py
+++ b/src/pip/_internal/network/lazy_wheel.py
@@ -39,12 +39,10 @@ def dist_from_wheel_url(name: str, url: str, session: Session) -> BaseDistributi
     is raised.
     """
     try:
-        with LazyZipOverHTTP(url, session) as zf:
-            zf.prefetch_dist_info()
-
+        with LazyHTTPFile(url, session) as lazy_file:
             # For read-only ZIP files, ZipFile only needs methods read,
             # seek, seekable and tell, not the whole IO protocol.
-            wheel = MemoryWheel(zf.name, zf)
+            wheel = MemoryWheel(lazy_file.name, lazy_file)
             # After context manager exit, wheel.name is an invalid file by intention.
             return get_wheel_distribution(wheel, canonicalize_name(name))
     except (BadZipFile, UnsupportedWheel):
@@ -147,7 +145,7 @@ def __next__(self) -> bytes:
         raise NotImplementedError


-class LazyZipOverHTTP(ReadOnlyIOWrapper):
+class LazyHTTPFile(ReadOnlyIOWrapper):
     """File-like object mapped to a ZIP file over HTTP.

     This uses HTTP range requests to lazily fetch the file's content,
@@ -161,20 +159,30 @@ class LazyZipOverHTTP(ReadOnlyIOWrapper):
     _domains_without_negative_range: ClassVar[set[str]] = set()

     def __init__(
-        self, url: str, session: Session, chunk_size: int = CONTENT_CHUNK_SIZE
+        self, url: str, session: Session, initial_chunk_size: int = CONTENT_CHUNK_SIZE
     ) -> None:
+        # Add delete=False and print the file's `.name` to debug invalid virtual zips.
         super().__init__(cast(BinaryIO, NamedTemporaryFile()))

         self._request_count = 0

         self._session = session
         self._url = url
-        self._chunk_size = chunk_size
         self._left: list[int] = []
         self._right: list[int] = []
-        self._length, initial_chunk = self._extract_content_length()
+        self._length, initial_chunk = self._extract_content_length(initial_chunk_size)
         self.truncate(self._length)
-        if initial_chunk is not None:
+        # The central directory for
+        # tensorflow_gpu-2.5.3-cp38-cp38-manylinux2010_x86_64.whl is 944931 bytes, for
+        # a 459424488 byte file (about 486x as large).
+        self._minimum_fetch_granularity = max(initial_chunk_size, self._length // 400)
+        if initial_chunk is None:
+            # If we could not download any file contents yet (e.g. because negative
+            # byte ranges were not supported), fetch the tail of the file now,
+            # hopefully pulling in the entire central directory.
+            initial_start = max(0, self._length - self._minimum_fetch_granularity)
+            self._download(initial_start, self._length - 1)
+        else:
             self.seek(-len(initial_chunk), io.SEEK_END)
             self._file.write(initial_chunk)
             self._left.append(self._length - len(initial_chunk))
@@ -192,18 +200,20 @@ def read(self, size: int = -1) -> bytes:
         if size < 0:
             assert cur <= self._length
             download_size = self._length - cur
+        elif size == 0:
+            return b""
         else:
-            download_size = max(size, self._chunk_size)
+            download_size = max(size, self._minimum_fetch_granularity)
         stop = min(cur + download_size, self._length)
         self._download(cur, stop - 1)
         return self._file.read(size)

-    def __enter__(self) -> LazyZipOverHTTP:
+    def __enter__(self) -> LazyHTTPFile:
         super().__enter__()
         return self

     def __exit__(self, *exc: Any) -> None:
-        logger.debug("requests for url %s: %s", self._url, self._request_count)
+        logger.debug("%d requests for url %s", self._request_count, self._url)
         super().__exit__(*exc)

     def _content_length_from_head(self) -> int:
@@ -211,9 +221,6 @@
         head = self._session.head(self._url, headers=HEADERS)
         head.raise_for_status()
         assert head.status_code == codes.ok
-        # S3 provides lowercased headers, and in the normal case these will return the
-        # same as 'Content-Length'.
-        # FIXME: provide documentation for this?
         return int(head.headers["content-length"])

     @staticmethod
@@ -222,10 +229,10 @@ def _parse_full_length_from_content_range(arg: str) -> Optional[int]:
             return int(m.group(1))
         return None

-    def _try_initial_chunk_request(self) -> tuple[int, bytes]:
+    def _try_initial_chunk_request(self, initial_chunk_size: int) -> tuple[int, bytes]:
         headers = HEADERS.copy()
         # Perform a negative range index, which is not supported by some servers.
-        headers["Range"] = f"bytes=-{self._chunk_size}"
+        headers["Range"] = f"bytes=-{initial_chunk_size}"
         # TODO: Get range requests to be correctly cached
         headers["Cache-Control"] = "no-cache"
         # TODO: If-Match (etag) to detect file changed during fetch would be a
@@ -243,7 +250,7 @@
         if code == codes.ok:
             # If this was done despite a smaller requested byte range, then we assume
             # the server does not support range requests.
-            if len(tail) > initial_chunk_size:
+            if len(tail) > initial_chunk_size:
                 raise HTTPRangeRequestUnsupported("returned complete file contents")
         elif code != codes.partial_content:
             raise HTTPRangeRequestUnsupported("did not receive partial content or ok")
@@ -253,14 +260,16 @@
                 return (file_length, tail.content)
         raise HTTPRangeRequestUnsupported(f"could not parse content-range: {range_arg}")

-    def _extract_content_length(self) -> tuple[int, Optional[bytes]]:
+    def _extract_content_length(
+        self, initial_chunk_size: int
+    ) -> tuple[int, Optional[bytes]]:
         domain = urlparse(self._url).netloc
         if domain in self._domains_without_negative_range:
             return (self._content_length_from_head(), None)

         # Initial range request for just the end of the file.
         try:
-            return self._try_initial_chunk_request()
+            return self._try_initial_chunk_request(initial_chunk_size)
         except HTTPError as e:
             resp = e.response
             code = resp.status_code
@@ -306,7 +315,7 @@ def _stay(self) -> Iterator[None]:
     def _check_zip(self) -> None:
         """Check and download until the file is a valid ZIP."""
         end = self._length - 1
-        for start in reversed(range(0, end, self._chunk_size)):
+        for start in reversed(range(0, end, CONTENT_CHUNK_SIZE)):
             self._download(start, end)
             with self._stay():
                 try:
@@ -363,23 +372,5 @@ def _download(self, start: int, end: int) -> None:
         for start, end in self._merge(start, end, left, right):
             response = self._stream_response(start, end)
             self.seek(start)
-            for chunk in response.iter_content(self._chunk_size):
+            for chunk in response.iter_content(CONTENT_CHUNK_SIZE):
                 self._file.write(chunk)
-
-    def prefetch_dist_info(self) -> None:
-        """
-        Read contents of entire dist-info section of wheel.
-
-        pip wants to read WHEEL and METADATA.
-        """
-        with self._stay():
-            zf = ZipFile(self)
-            infolist = zf.infolist()
-            for info in infolist:
-                # should be (wheel filename without extension etc) + (.dist-info/)
-                if ".dist-info/" in info.filename:
-                    start = info.header_offset
-                    end = zf.start_dir
-                    self.seek(start)
-                    self.read(end - start)
-                    break
diff --git a/src/pip/_internal/utils/wheel.py b/src/pip/_internal/utils/wheel.py
index e5e3f34ed81..a3359ea3ece 100644
--- a/src/pip/_internal/utils/wheel.py
+++ b/src/pip/_internal/utils/wheel.py
@@ -70,6 +70,7 @@ def wheel_dist_info_dir(source: ZipFile, name: str) -> str:

 def read_wheel_metadata_file(source: ZipFile, path: str) -> bytes:
     try:
+        logger.debug("extracting entry '%s' from zip '%s'", path, source.fp.name)
         return source.read(path)
         # BadZipFile for general corruption, KeyError for missing entry,
         # and RuntimeError for password-protected files
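
Notes for reviewers follow; none of the text or code below is part of the patch.

The negative-range probe in _try_initial_chunk_request can be reproduced
standalone. Here is a minimal sketch of the probe-then-fallback logic using
plain requests: fetch_tail is a hypothetical helper, and a server that ignores
"bytes=-N" is exactly the case the patch records in
_domains_without_negative_range:

from typing import Optional, Tuple

import requests


def fetch_tail(url: str, tail_size: int = 10_240) -> Tuple[int, Optional[bytes]]:
    """Return (total_length, tail_bytes); tail_bytes is None if we fell back to HEAD."""
    resp = requests.get(url, headers={"Range": f"bytes=-{tail_size}"})
    resp.raise_for_status()
    if resp.status_code == requests.codes.partial_content:
        # "Content-Range: bytes 123-456/457" carries the full length after the "/".
        total = int(resp.headers["Content-Range"].rpartition("/")[2])
        return (total, resp.content)
    # The server ignored the negative range and sent the whole file (200 OK);
    # issue a cheap HEAD for the length and report that nothing is cached yet.
    head = requests.head(url)
    head.raise_for_status()
    return (int(head.headers["Content-Length"]), None)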
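
The reason fetching only the file's tail usually suffices: ZipFile locates the
central directory at the end of the archive, and for read-only use it only
calls read, seek, seekable and tell on the underlying file (the same property
dist_from_wheel_url relies on when handing a lazy file to MemoryWheel). A
small self-contained demonstration with a toy archive and a tracing wrapper,
not pip code:

import io
import zipfile


class TracingFile(io.BytesIO):
    """BytesIO that logs every seek/read, to show where ZipFile actually looks."""

    def seek(self, offset: int, whence: int = io.SEEK_SET) -> int:
        print(f"seek({offset}, {whence})")
        return super().seek(offset, whence)

    def read(self, size: int = -1) -> bytes:
        print(f"read({size}) at offset {self.tell()}")
        return super().read(size)


buf = io.BytesIO()
with zipfile.ZipFile(buf, "w") as zf:
    zf.writestr("demo-1.0.dist-info/METADATA", "Name: demo\nVersion: 1.0\n")

with zipfile.ZipFile(TracingFile(buf.getvalue())) as zf:
    # The constructor's accesses printed above all land near the end of the
    # buffer (the central directory); only this read jumps back to the
    # member's own header offset.
    print(zf.read("demo-1.0.dist-info/METADATA"))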
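
Finally, the // 400 heuristic behind _minimum_fetch_granularity, checked
against the numbers quoted in the new comments (the wheel and central
directory sizes come from the patch; CONTENT_CHUNK_SIZE is 10 * 1024 in
requests; everything else is arithmetic):

CONTENT_CHUNK_SIZE = 10 * 1024
length = 459424488          # tensorflow_gpu-2.5.3 wheel size, per the comment
central_directory = 944931  # its central directory size, per the comment

minimum_fetch_granularity = max(CONTENT_CHUNK_SIZE, length // 400)
print(round(length / central_directory, 1))            # 486.2 ("about 486x as large")
print(minimum_fetch_granularity)                       # 1148561
print(minimum_fetch_granularity >= central_directory)  # True: one tail fetch suffices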