From df6203538b64787a78ab056a13e9e5c096606c91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Randy=20D=C3=B6ring?= <30527984+radoering@users.noreply.github.com> Date: Sat, 16 Dec 2023 12:08:14 +0100 Subject: [PATCH 1/9] feature: add new "lazy-wheel" config option If active and a server supports range requests, we do not have to download entire wheels to fetch metadata but can just download the METADATA file from the remote wheel with a couple of range requests. --- docs/configuration.md | 17 + src/poetry/config/config.py | 4 + src/poetry/console/commands/config.py | 3 +- src/poetry/inspection/info.py | 22 +- src/poetry/inspection/lazy_wheel.py | 762 ++++++++++++++++++ src/poetry/repositories/http_repository.py | 70 +- src/poetry/utils/authenticator.py | 4 + src/poetry/utils/helpers.py | 13 + tests/console/commands/test_config.py | 25 + ...ssing_dist_info-0.1.0-py2.py3-none-any.whl | Bin 0 -> 310 bytes tests/helpers.py | 10 +- tests/inspection/conftest.py | 148 ++++ tests/inspection/test_info.py | 24 + tests/inspection/test_lazy_wheel.py | 188 +++++ tests/repositories/test_http_repository.py | 144 ++++ tests/repositories/test_legacy_repository.py | 5 +- tests/repositories/test_pypi_repository.py | 5 +- .../test_single_page_repository.py | 5 +- tests/utils/test_helpers.py | 35 + 19 files changed, 1466 insertions(+), 18 deletions(-) create mode 100644 src/poetry/inspection/lazy_wheel.py create mode 100644 tests/fixtures/distributions/demo_missing_dist_info-0.1.0-py2.py3-none-any.whl create mode 100644 tests/inspection/conftest.py create mode 100644 tests/inspection/test_lazy_wheel.py create mode 100644 tests/repositories/test_http_repository.py diff --git a/docs/configuration.md b/docs/configuration.md index 384140b4162..67a0301c3cd 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -271,6 +271,23 @@ across all your projects if incorrectly set. Use parallel execution when using the new (`>=1.1.0`) installer. +### `solver.lazy-wheel` + +**Type**: `boolean` + +**Default**: `true` + +**Environment Variable**: `POETRY_SOLVER_LAZY_WHEEL` + +*Introduced in 1.8.0* + +Do not download entire wheels to extract metadata but use +[HTTP range requests](https://developer.mozilla.org/en-US/docs/Web/HTTP/Range_requests) +to only download the METADATA files of wheels. +Especially with slow network connections this setting can speed up dependency resolution significantly. +If the cache has already been filled or the server does not support HTTP range requests, +this setting makes no difference. 
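+
+For illustration, the kind of request this setting enables looks roughly like the
+following sketch (the wheel URL is a placeholder, not a real index URL):
+
+```python
+import requests
+
+url = "https://example.org/packages/demo-0.1.0-py3-none-any.whl"  # placeholder
+# Ask only for the last bytes of the wheel, which normally contain the zip
+# central directory; a 206 response means the server honoured the range.
+response = requests.get(
+    url, headers={"Range": "bytes=-10000", "Accept-Encoding": "identity"}
+)
+print(response.status_code)  # 206 if range requests are supported
+```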
+ ### `virtualenvs.create` **Type**: `boolean` diff --git a/src/poetry/config/config.py b/src/poetry/config/config.py index 9f0ecaec792..5bafb90218b 100644 --- a/src/poetry/config/config.py +++ b/src/poetry/config/config.py @@ -131,6 +131,9 @@ class Config: "max-workers": None, "no-binary": None, }, + "solver": { + "lazy-wheel": True, + }, "warnings": { "export": True, }, @@ -296,6 +299,7 @@ def _get_normalizer(name: str) -> Callable[[str], Any]: "experimental.system-git-client", "installer.modern-installation", "installer.parallel", + "solver.lazy-wheel", "warnings.export", }: return boolean_normalizer diff --git a/src/poetry/console/commands/config.py b/src/poetry/console/commands/config.py index 3b460fb7aa2..ca5880a01e2 100644 --- a/src/poetry/console/commands/config.py +++ b/src/poetry/console/commands/config.py @@ -68,15 +68,16 @@ def unique_config_values(self) -> dict[str, tuple[Any, Any]]: ), "virtualenvs.path": (str, lambda val: str(Path(val))), "virtualenvs.prefer-active-python": (boolean_validator, boolean_normalizer), + "virtualenvs.prompt": (str, lambda val: str(val)), "experimental.system-git-client": (boolean_validator, boolean_normalizer), "installer.modern-installation": (boolean_validator, boolean_normalizer), "installer.parallel": (boolean_validator, boolean_normalizer), "installer.max-workers": (lambda val: int(val) > 0, int_normalizer), - "virtualenvs.prompt": (str, lambda val: str(val)), "installer.no-binary": ( PackageFilterPolicy.validator, PackageFilterPolicy.normalize, ), + "solver.lazy-wheel": (boolean_validator, boolean_normalizer), "warnings.export": (boolean_validator, boolean_normalizer), } diff --git a/src/poetry/inspection/info.py b/src/poetry/inspection/info.py index 8ca3e2fe975..9afb4612b37 100644 --- a/src/poetry/inspection/info.py +++ b/src/poetry/inspection/info.py @@ -32,6 +32,8 @@ from poetry.core.packages.project_package import ProjectPackage + from poetry.inspection.lazy_wheel import MemoryWheel + logger = logging.getLogger(__name__) @@ -229,9 +231,7 @@ def to_package( return package @classmethod - def _from_distribution( - cls, dist: pkginfo.BDist | pkginfo.SDist | pkginfo.Wheel - ) -> PackageInfo: + def _from_distribution(cls, dist: pkginfo.Distribution) -> PackageInfo: """ Helper method to parse package information from a `pkginfo.Distribution` instance. @@ -242,7 +242,7 @@ def _from_distribution( if dist.requires_dist: requirements = list(dist.requires_dist) - else: + elif isinstance(dist, (pkginfo.BDist, pkginfo.SDist, pkginfo.Wheel)): requires = Path(dist.filename) / "requires.txt" if requires.exists(): text = requires.read_text(encoding="utf-8") @@ -256,8 +256,9 @@ def _from_distribution( requires_python=dist.requires_python, ) - info._source_type = "file" - info._source_url = Path(dist.filename).resolve().as_posix() + if isinstance(dist, (pkginfo.BDist, pkginfo.SDist, pkginfo.Wheel)): + info._source_type = "file" + info._source_url = Path(dist.filename).resolve().as_posix() return info @@ -520,6 +521,15 @@ def from_wheel(cls, path: Path) -> PackageInfo: except ValueError: return PackageInfo() + @classmethod + def from_memory_wheel(cls, wheel: MemoryWheel) -> PackageInfo: + """ + Gather package information from a partial fetched wheel kept in memory. + + :param path: Path to wheel. 
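+
+        A sketch of the intended call pattern (``url`` and ``session`` are
+        assumptions supplied by the caller, as in ``HTTPRepository``):
+
+            from poetry.inspection.lazy_wheel import memory_wheel_from_url
+
+            wheel = memory_wheel_from_url("demo", url, session)
+            info = PackageInfo.from_memory_wheel(wheel)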
+ """ + return cls._from_distribution(wheel) + @classmethod def from_bdist(cls, path: Path) -> PackageInfo: """ diff --git a/src/poetry/inspection/lazy_wheel.py b/src/poetry/inspection/lazy_wheel.py new file mode 100644 index 00000000000..7542ed3abe0 --- /dev/null +++ b/src/poetry/inspection/lazy_wheel.py @@ -0,0 +1,762 @@ +"""Lazy ZIP over HTTP""" + +from __future__ import annotations + +import abc +import io +import logging +import re + +from bisect import bisect_left +from bisect import bisect_right +from contextlib import contextmanager +from tempfile import NamedTemporaryFile +from typing import TYPE_CHECKING +from typing import Any +from typing import BinaryIO +from typing import ClassVar +from typing import cast +from urllib.parse import urlparse +from zipfile import BadZipFile +from zipfile import ZipFile + +from pkginfo import Distribution +from requests.models import CONTENT_CHUNK_SIZE +from requests.models import HTTPError +from requests.models import Response +from requests.status_codes import codes + + +if TYPE_CHECKING: + from collections.abc import Iterable + from collections.abc import Iterator + + from requests import Session + + from poetry.utils.authenticator import Authenticator + + +logger = logging.getLogger(__name__) + + +class HTTPRangeRequestUnsupported(Exception): + """Raised when the remote server appears unable to support byte ranges.""" + + +class UnsupportedWheel(Exception): + """Unsupported wheel.""" + + +class InvalidWheel(Exception): + """Invalid (e.g. corrupt) wheel.""" + + def __init__(self, location: str, name: str): + self.location = location + self.name = name + + def __str__(self) -> str: + return f"Wheel '{self.name}' located at {self.location} is invalid." + + +class MemoryWheel(Distribution): + def __init__(self, lazy_file: LazyWheelOverHTTP) -> None: + self.lazy_file = lazy_file + self.extractMetadata() + + def read(self) -> bytes: + with ZipFile(self.lazy_file) as archive: + tuples = [x.split("/") for x in archive.namelist() if "METADATA" in x] + schwarz = sorted([(len(x), x) for x in tuples]) + for path in [x[1] for x in schwarz]: + candidate = "/".join(path) + logger.debug(f"read {candidate}") + data = archive.read(candidate) + if b"Metadata-Version" in data: + return data + else: + raise ValueError(f"No METADATA in archive: {self.lazy_file.name}") + + +def memory_wheel_from_url( + name: str, url: str, session: Session | Authenticator +) -> MemoryWheel: + """Return a MemoryWheel (compatible to pkginfo.Wheel) from the given wheel URL. + + This uses HTTP range requests to only fetch the portion of the wheel + containing metadata, just enough for the object to be constructed. + + :raises HTTPRangeRequestUnsupported: if range requests are unsupported for ``url``. + :raises InvalidWheel: if the zip file contents could not be parsed. + """ + try: + # After context manager exit, wheel.name will point to a deleted file path. + # Add `delete_backing_file=False` to disable this for debugging. + with LazyWheelOverHTTP(url, session) as lazy_file: + # prefetch metadata to reduce the number of range requests + # (we know that METADATA is the only file from the wheel we need) + lazy_file.prefetch_metadata(name) + return MemoryWheel(lazy_file) + + except (BadZipFile, UnsupportedWheel): + # We assume that these errors have occurred because the wheel contents + # themselves are invalid, not because we've messed up our bookkeeping + # and produced an invalid file. 
+ raise InvalidWheel(url, name) + + +class MergeIntervals: + """Stateful bookkeeping to merge interval graphs.""" + + def __init__(self, *, left: Iterable[int] = (), right: Iterable[int] = ()) -> None: + self._left = list(left) + self._right = list(right) + + def __repr__(self) -> str: + return ( + f"{type(self).__name__}" + f"(left={tuple(self._left)}, right={tuple(self._right)})" + ) + + def _merge( + self, start: int, end: int, left: int, right: int + ) -> Iterator[tuple[int, int]]: + """Return an iterator of intervals to be fetched. + + Args: + start (int): Start of needed interval + end (int): End of needed interval + left (int): Index of first overlapping downloaded data + right (int): Index after last overlapping downloaded data + """ + lslice, rslice = self._left[left:right], self._right[left:right] + i = start = min([start] + lslice[:1]) + end = max([end] + rslice[-1:]) + for j, k in zip(lslice, rslice): + if j > i: + yield i, j - 1 + i = k + 1 + if i <= end: + yield i, end + self._left[left:right], self._right[left:right] = [start], [end] + + def minimal_intervals_covering( + self, start: int, end: int + ) -> Iterator[tuple[int, int]]: + """Provide the intervals needed to cover from ``start <= x <= end``. + + This method mutates internal state so that later calls only return intervals not + covered by prior calls. The first call to this method will always return exactly + one interval, which was exactly the one requested. Later requests for + intervals overlapping that first requested interval will yield only the ranges + not previously covered (which may be empty, e.g. if the same interval is + requested twice). + + This may be used e.g. to download substrings of remote files on demand. + """ + left = bisect_left(self._right, start) + right = bisect_right(self._left, end) + yield from self._merge(start, end, left, right) + + +class ReadOnlyIOWrapper(BinaryIO): + """Implement read-side ``BinaryIO`` methods wrapping an inner ``BinaryIO``. + + This wrapper is useful because Python currently does not distinguish read-only + streams at the type level. + """ + + def __init__(self, inner: BinaryIO) -> None: + self._file = inner + + @property + def mode(self) -> str: + """Opening mode, which is always rb.""" + return "rb" + + @property + def name(self) -> str: + """Path to the underlying file.""" + return self._file.name + + def seekable(self) -> bool: + """Return whether random access is supported, which is True.""" + return True + + def close(self) -> None: + """Close the file.""" + self._file.close() + + @property + def closed(self) -> bool: + """Whether the file is closed.""" + return self._file.closed + + def fileno(self) -> int: + return self._file.fileno() + + def flush(self) -> None: + self._file.flush() + + def isatty(self) -> bool: + return False + + def readable(self) -> bool: + """Return whether the file is readable, which is True.""" + return True + + def read(self, size: int = -1) -> bytes: + """Read up to size bytes from the object and return them. + + As a convenience, if size is unspecified or -1, + all bytes until EOF are returned. Fewer than + size bytes may be returned if EOF is reached. + """ + return self._file.read(size) + + def readline(self, limit: int = -1) -> bytes: + # Explicit impl needed to satisfy mypy. + raise NotImplementedError + + def readlines(self, hint: int = -1) -> list[bytes]: + raise NotImplementedError + + def seek(self, offset: int, whence: int = 0) -> int: + """Change stream position and return the new absolute position. 
+ + Seek to offset relative position indicated by whence: + * 0: Start of stream (the default). pos should be >= 0; + * 1: Current position - pos may be negative; + * 2: End of stream - pos usually negative. + """ + return self._file.seek(offset, whence) + + def tell(self) -> int: + """Return the current position.""" + return self._file.tell() + + def truncate(self, size: int | None = None) -> int: + """Resize the stream to the given size in bytes. + + If size is unspecified resize to the current position. + The current stream position isn't changed. + + Return the new file size. + """ + return self._file.truncate(size) + + def writable(self) -> bool: + """Return False.""" + return False + + def write(self, s: Any) -> int: + raise NotImplementedError + + def writelines(self, lines: Iterable[Any]) -> None: + raise NotImplementedError + + def __enter__(self) -> ReadOnlyIOWrapper: + self._file.__enter__() + return self + + def __exit__(self, *exc: Any) -> None: + self._file.__exit__(*exc) + + def __iter__(self) -> Iterator[bytes]: + raise NotImplementedError + + def __next__(self) -> bytes: + raise NotImplementedError + + +class LazyRemoteResource(ReadOnlyIOWrapper): + """Abstract class for a binary file object that lazily downloads its contents. + + This class uses a ``MergeIntervals`` instance to keep track of what it's downloaded. + """ + + def __init__(self, inner: BinaryIO) -> None: + super().__init__(inner) + self._merge_intervals: MergeIntervals | None = None + + @abc.abstractmethod + def fetch_content_range(self, start: int, end: int) -> Iterator[bytes]: + """Call to the remote backend to provide this byte range in one or more chunks. + + NB: For compatibility with HTTP range requests, the range provided to this + method must *include* the byte indexed at argument ``end`` (so e.g. ``0-1`` is 2 + bytes long, and the range can never be empty). + + :raises Exception: implementations may raise an exception for e.g. intermittent + errors accessing the remote resource. + """ + ... + + def _setup_content(self) -> None: + """Ensure ``self._merge_intervals`` is initialized. + + Called in ``__enter__``, and should make recursive invocations into a no-op. + Subclasses may override this method.""" + if self._merge_intervals is None: + self._merge_intervals = MergeIntervals() + + def _reset_content(self) -> None: + """Unset ``self._merge_intervals``. + + Called in ``__exit__``, and should make recursive invocations into a no-op. + Subclasses may override this method.""" + if self._merge_intervals is not None: + logger.debug( + "unsetting merge intervals (were: %s)", repr(self._merge_intervals) + ) + self._merge_intervals = None + + def __enter__(self) -> LazyRemoteResource: + """Call ``self._setup_content()``, then return self.""" + super().__enter__() + self._setup_content() + return self + + def __exit__(self, *exc: Any) -> None: + """Call ``self._reset_content()``.""" + self._reset_content() + super().__exit__(*exc) + + @contextmanager + def _stay(self) -> Iterator[None]: + """Return a context manager keeping the position. + + At the end of the block, seek back to original position. + """ + pos = self.tell() + try: + yield + finally: + self.seek(pos) + + def ensure_downloaded(self, start: int, end: int) -> None: + """Ensures bytes start to end (inclusive) have been downloaded and written to + the backing file. + + :raises ValueError: if ``__enter__`` was not called beforehand. 
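+
+        For illustration, the underlying ``MergeIntervals`` bookkeeping only yields
+        ranges that have not been fetched before (the byte offsets are made up):
+
+            intervals = MergeIntervals()
+            list(intervals.minimal_intervals_covering(10, 20))  # -> [(10, 20)]
+            list(intervals.minimal_intervals_covering(15, 30))  # -> [(21, 30)]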
+ """ + if self._merge_intervals is None: + raise ValueError(".__enter__() must be called to set up merge intervals") + # Reducing by 1 to get an inclusive end range. + end -= 1 + with self._stay(): + for ( + range_start, + range_end, + ) in self._merge_intervals.minimal_intervals_covering(start, end): + self.seek(start) + for chunk in self.fetch_content_range(range_start, range_end): + self._file.write(chunk) + + +class FixedSizeLazyResource(LazyRemoteResource): + """Fetches a fixed content length from the remote server when initialized in order + to support ``.read(-1)`` and seek calls against ``io.SEEK_END``.""" + + def __init__(self, inner: BinaryIO) -> None: + super().__init__(inner) + self._length: int | None = None + + @abc.abstractmethod + def _fetch_content_length(self) -> int: + """Query the remote resource for the total length of the file. + + This method may also mutate internal state, such as writing to the backing + file. It's marked private because it will only be called within this class's + ``__enter__`` implementation to populate an internal length field. + + :raises Exception: implementations may raise any type of exception if the length + value could not be parsed, or any other issue which might + cause valid calls to ``self.fetch_content_range()`` to fail. + """ + raise NotImplementedError + + def _setup_content(self) -> None: + """Initialize the internal length field and other bookkeeping. + + After parsing the remote file length with ``self._fetch_content_length()``, + this method will truncate the underlying file from parent abstract class + ``ReadOnlyIOWrapper`` to that size in order to support seek operations against + ``io.SEEK_END`` in ``self.read()``. + + This method is called in ``__enter__``. + """ + super()._setup_content() + + if self._length is None: + logger.debug("begin fetching content length") + self._length = self._fetch_content_length() + logger.debug("done fetching content length (is: %d)", self._length) + else: + logger.debug("content length already fetched (is: %d)", self._length) + + # Enable us to seek and write anywhere in the backing file up to this + # known length. + self.truncate(self._length) + + def _reset_content(self) -> None: + """Unset the internal length field and other bookkeeping. + + This method is called in ``__exit__``.""" + super()._reset_content() + + if self._length is not None: + logger.debug("unsetting content length (was: %d)", self._length) + self._length = None + + def read(self, size: int = -1) -> bytes: + """Read up to size bytes from the object and return them. + + As a convenience, if size is unspecified or -1, + all bytes until EOF are returned. Fewer than + size bytes may be returned if EOF is reached. + + :raises ValueError: if ``__enter__`` was not called beforehand. + """ + if self._length is None: + raise ValueError(".__enter__() must be called to set up content length") + cur = self.tell() + logger.debug("read size %d at %d from lazy file %s", size, cur, self.name) + if size < 0: + assert cur <= self._length + download_size = self._length - cur + elif size == 0: + return b"" + else: + download_size = size + stop = min(cur + download_size, self._length) + self.ensure_downloaded(cur, stop) + return super().read(download_size) + + +class LazyHTTPFile(FixedSizeLazyResource): + """File-like object representing a fixed-length file over HTTP. + + This uses HTTP range requests to lazily fetch the file's content into a temporary + file. 
If such requests are not supported by the server, raises + ``HTTPRangeRequestUnsupported`` in the ``__enter__`` method.""" + + def __init__( + self, + url: str, + session: Session | Authenticator, + delete_backing_file: bool = True, + ) -> None: + super().__init__(cast(BinaryIO, NamedTemporaryFile(delete=delete_backing_file))) + + self._request_count = 0 + self._session = session + self._url = url + + def _fetch_content_length(self) -> int: + """Get the remote file's length from a HEAD request.""" + # NB: This is currently dead code, as _fetch_content_length() is overridden + # again in LazyWheelOverHTTP. + return self._content_length_from_head() + + def fetch_content_range(self, start: int, end: int) -> Iterator[bytes]: + """Perform a series of HTTP range requests to cover the specified byte range.""" + yield from self._stream_response(start, end).iter_content(CONTENT_CHUNK_SIZE) + + def __enter__(self) -> LazyHTTPFile: + """Fetch the remote file length and reset the log of downloaded intervals. + + This method must be called before ``.read()``. + + :raises HTTPRangeRequestUnsupported: if the remote server fails to indicate + support for "bytes" ranges. + """ + super().__enter__() + return self + + def __exit__(self, *exc: Any) -> None: + """Logs request count to quickly identify any pathological cases in log data.""" + logger.debug("%d requests for url %s", self._request_count, self._url) + super().__exit__(*exc) + + def _content_length_from_head(self) -> int: + """Performs a HEAD request to extract the Content-Length. + + :raises HTTPRangeRequestUnsupported: if the response fails to indicate support + for "bytes" ranges.""" + self._request_count += 1 + head = self._session.head(self._url, headers=self._uncached_headers()) + head.raise_for_status() + assert head.status_code == codes.ok + accepted_range = head.headers.get("Accept-Ranges", None) + if accepted_range != "bytes": + raise HTTPRangeRequestUnsupported( + f"server does not support byte ranges: header was '{accepted_range}'" + ) + return int(head.headers["Content-Length"]) + + def _stream_response(self, start: int, end: int) -> Response: + """Return streaming HTTP response to a range request from start to end.""" + headers = self._uncached_headers() + headers["Range"] = f"bytes={start}-{end}" + logger.debug("streamed bytes request: %s", headers["Range"]) + self._request_count += 1 + response = self._session.get(self._url, headers=headers, stream=True) + response.raise_for_status() + assert int(response.headers["Content-Length"]) == (end - start + 1) + return response + + @classmethod + def _uncached_headers(cls) -> dict[str, str]: + """HTTP headers to bypass any HTTP caching. + + The requests we perform in this file are intentionally small, and any caching + should be done at a higher level. + + Further, caching partial requests might cause issues: + https://github.com/pypa/pip/pull/8716 + """ + # "no-cache" is the correct value for "up to date every time", so this will also + # ensure we get the most recent value from the server: + # https://developer.mozilla.org/en-US/docs/Web/HTTP/Caching#provide_up-to-date_content_every_time + return {"Accept-Encoding": "identity", "Cache-Control": "no-cache"} + + +class LazyWheelOverHTTP(LazyHTTPFile): + """File-like object mapped to a ZIP file over HTTP. + + This uses HTTP range requests to lazily fetch the file's content, which should be + provided as the first argument to a ``ZipFile``. 
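+
+    A minimal usage sketch (``url``, ``session`` and ``name`` are assumed to be
+    supplied by the caller, as in ``memory_wheel_from_url`` above):
+
+        with LazyWheelOverHTTP(url, session) as lazy_file:
+            lazy_file.prefetch_metadata(name)
+            with ZipFile(lazy_file) as archive:
+                names = archive.namelist()  # reads only the downloaded ranges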
+ """ + + # Cache this on the type to avoid trying and failing our initial lazy wheel request + # multiple times in the same invocation against an index without this support. + _domains_without_negative_range: ClassVar[set[str]] = set() + + # This override is needed for mypy so we can call ``.prefetch_metadata()`` + # within a ``with`` block. + def __enter__(self) -> LazyWheelOverHTTP: + """Fetch the remote file length and reset the log of downloaded intervals. + + This method must be called before ``.read()`` or ``.prefetch_metadata()``. + """ + super().__enter__() + return self + + @classmethod + def _initial_chunk_length(cls) -> int: + """Return the size of the chunk (in bytes) to download from the end of the file. + + This method is called in ``self._fetch_content_length()``. As noted in that + method's docstring, this should be set high enough to cover the central + directory sizes of the *average* wheels you expect to see, in order to avoid + further requests before being able to process the zip file's contents at all. + If we choose a small number, we need one more range request for larger wheels. + If we choose a big number, we download unnecessary data from smaller wheels. + If the chunk size from this method is larger than the size of an entire wheel, + that may raise an HTTP error, but this is gracefully handled in + ``self._fetch_content_length()`` with a small performance penalty. + """ + return 10_000 + + def _fetch_content_length(self) -> int: + """Get the total remote file length, but also download a chunk from the end. + + This method is called within ``__enter__``. In an attempt to reduce + the total number of requests needed to populate this lazy file's contents, this + method will also attempt to fetch a chunk of the file's actual content. This + chunk will be ``self._initial_chunk_length()`` bytes in size, or just the remote + file's length if that's smaller, and the chunk will come from the *end* of + the file. + + This method will first attempt to download with a negative byte range request, + i.e. a GET with the headers ``Range: bytes=-N`` for ``N`` equal to + ``self._initial_chunk_length()``. If negative offsets are unsupported, it will + instead fall back to making a HEAD request first to extract the length, followed + by a GET request with the double-ended range header ``Range: bytes=X-Y`` to + extract the final ``N`` bytes from the remote resource. + """ + initial_chunk_size = self._initial_chunk_length() + ret_length, tail = self._extract_content_length(initial_chunk_size) + + # Need to explicitly truncate here in order to perform the write and seek + # operations below when we write the chunk of file contents to disk. + self.truncate(ret_length) + + if tail is None: + # If we could not download any file contents yet (e.g. if negative byte + # ranges were not supported, or the requested range was larger than the file + # size), then download all of this at once, hopefully pulling in the entire + # central directory. + initial_start = max(0, ret_length - initial_chunk_size) + self.ensure_downloaded(initial_start, ret_length) + else: + # If we *could* download some file contents, then write them to the end of + # the file and set up our bisect boundaries by hand. + with self._stay(): + response_length = int(tail.headers["Content-Length"]) + assert response_length == min(initial_chunk_size, ret_length) + self.seek(-response_length, io.SEEK_END) + # Default initial chunk size is currently 1MB, but streaming content + # here allows it to be set arbitrarily large. 
+ for chunk in tail.iter_content(CONTENT_CHUNK_SIZE): + self._file.write(chunk) + + # We now need to update our bookkeeping to cover the interval we just + # wrote to file so we know not to do it in later read()s. + init_chunk_start = ret_length - response_length + # MergeIntervals uses inclusive boundaries i.e. start <= x <= end. + init_chunk_end = ret_length - 1 + assert self._merge_intervals is not None + assert ((init_chunk_start, init_chunk_end),) == tuple( + # NB: We expect LazyRemoteResource to reset `self._merge_intervals` + # just before it calls the current method, so our assertion here + # checks that indeed no prior overlapping intervals have + # been covered. + self._merge_intervals.minimal_intervals_covering( + init_chunk_start, init_chunk_end + ) + ) + return ret_length + + @staticmethod + def _parse_full_length_from_content_range(arg: str) -> int: + """Parse the file's full underlying length from the Content-Range header. + + This supports both * and numeric ranges, from success or error responses: + https://www.rfc-editor.org/rfc/rfc9110#field.content-range. + """ + m = re.match(r"bytes [^/]+/([0-9]+)", arg) + if m is None: + raise HTTPRangeRequestUnsupported(f"could not parse Content-Range: '{arg}'") + return int(m.group(1)) + + def _try_initial_chunk_request( + self, initial_chunk_size: int + ) -> tuple[int, Response]: + """Attempt to fetch a chunk from the end of the file with a negative offset.""" + headers = self._uncached_headers() + # Perform a negative range index, which is not supported by some servers. + headers["Range"] = f"bytes=-{initial_chunk_size}" + logger.debug("initial bytes request: %s", headers["Range"]) + + self._request_count += 1 + tail = self._session.get(self._url, headers=headers, stream=True) + tail.raise_for_status() + + code = tail.status_code + if code != codes.partial_content: + # According to + # https://developer.mozilla.org/en-US/docs/Web/HTTP/Range_requests, + # a 200 OK implies that range requests are not supported, + # regardless of the requested size. + # However, some servers that support negative range requests also return a + # 200 OK if the requested range from the end was larger than the file size. + if code == codes.ok: + accept_ranges = tail.headers.get("Accept-Ranges", None) + content_length = int(tail.headers["Content-Length"]) + if accept_ranges == "bytes" and content_length <= initial_chunk_size: + return content_length, tail + + raise HTTPRangeRequestUnsupported( + f"did not receive partial content: got code {code}" + ) + + file_length = self._parse_full_length_from_content_range( + tail.headers["Content-Range"] + ) + return (file_length, tail) + + def _extract_content_length( + self, initial_chunk_size: int + ) -> tuple[int, Response | None]: + """Get the Content-Length of the remote file, and possibly a chunk of it.""" + domain = urlparse(self._url).netloc + if domain in self._domains_without_negative_range: + return (self._content_length_from_head(), None) + + tail: Response | None + try: + # Initial range request for just the end of the file. + file_length, tail = self._try_initial_chunk_request(initial_chunk_size) + except HTTPError as e: + resp = e.response + code = resp.status_code if resp is not None else None + # Our initial request using a negative byte range was not supported. + if code in [codes.not_implemented, codes.method_not_allowed]: + # pypi notably does not support negative byte ranges: see + # https://github.com/pypi/warehouse/issues/12823. 
+ logger.debug( + "Negative byte range not supported for domain '%s': " + "using HEAD request before lazy wheel from now on", + domain, + ) + # Avoid trying a negative byte range request against this domain for the + # rest of the resolve. + self._domains_without_negative_range.add(domain) + # Apply a HEAD request to get the real size, and nothing else for now. + return (self._content_length_from_head(), None) + # This indicates that the requested range from the end was larger than the + # actual file size: https://www.rfc-editor.org/rfc/rfc9110#status.416. + if code == codes.requested_range_not_satisfiable: + # In this case, we don't have any file content yet, but we do know the + # size the file will be, so we can return that and exit here. + assert resp is not None + file_length = self._parse_full_length_from_content_range( + resp.headers["Content-Range"] + ) + return (file_length, None) + # If we get some other error, then we expect that non-range requests will + # also fail, so we error out here and let the user figure it out. + raise + + # Some servers that do not support negative offsets, + # handle a negative offset like "-10" as "0-10". + if int( + tail.headers["Content-Length"] + ) == initial_chunk_size + 1 and tail.headers["Content-Range"].startswith( + f"bytes 0-{initial_chunk_size}" + ): + tail = None + self._domains_without_negative_range.add(domain) + return file_length, tail + + def prefetch_metadata(self, name: str) -> None: + """Locate the *.dist-info/METADATA entry from a temporary ``ZipFile`` wrapper, + and download it. + + This method assumes that the *.dist-info directory (containing e.g. METADATA) is + contained in a single contiguous section of the zip file in order to ensure it + can be downloaded in a single ranged GET request.""" + logger.debug("begin prefetching METADATA for %s", name) + + dist_info_prefix = re.compile(r"^[^/]*\.dist-info/METADATA") + start: int | None = None + end: int | None = None + + # This may perform further requests if __init__() did not pull in the entire + # central directory at the end of the file (although _initial_chunk_length() + # should be set large enough to avoid this). + zf = ZipFile(self) + + for info in zf.infolist(): + if start is None: + if dist_info_prefix.search(info.filename): + start = info.header_offset + continue + else: + # The last .dist-info/ entry may be before the end of the file if the + # wheel's entries are sorted lexicographically (which is unusual). + if not dist_info_prefix.search(info.filename): + end = info.header_offset + break + if start is None: + raise UnsupportedWheel( + f"no {dist_info_prefix!r} found for {name} in {self.name}" + ) + # If it is the last entry of the zip, then give us everything + # until the start of the central directory. 
+ if end is None: + end = zf.start_dir + logger.debug("fetch METADATA") + self.ensure_downloaded(start, end) + logger.debug("done prefetching METADATA for %s", name) diff --git a/src/poetry/repositories/http_repository.py b/src/poetry/repositories/http_repository.py index 826703a3760..546dfc4dcdc 100644 --- a/src/poetry/repositories/http_repository.py +++ b/src/poetry/repositories/http_repository.py @@ -19,12 +19,16 @@ from poetry.core.utils.helpers import temporary_directory from poetry.core.version.markers import parse_marker +from poetry.config.config import Config +from poetry.inspection.lazy_wheel import HTTPRangeRequestUnsupported +from poetry.inspection.lazy_wheel import memory_wheel_from_url from poetry.repositories.cached_repository import CachedRepository from poetry.repositories.exceptions import PackageNotFound from poetry.repositories.exceptions import RepositoryError from poetry.repositories.link_sources.html import HTMLPage from poetry.utils.authenticator import Authenticator from poetry.utils.constants import REQUESTS_TIMEOUT +from poetry.utils.helpers import HTTPRangeRequestSupported from poetry.utils.helpers import download_file from poetry.utils.patterns import wheel_file_re @@ -32,7 +36,6 @@ if TYPE_CHECKING: from packaging.utils import NormalizedName - from poetry.config.config import Config from poetry.inspection.info import PackageInfo from poetry.repositories.link_sources.base import LinkSource from poetry.utils.authenticator import RepositoryCertificateConfig @@ -49,6 +52,8 @@ def __init__( ) -> None: super().__init__(name, disable_cache, config) self._url = url + if config is None: + config = Config.create() self._authenticator = Authenticator( config=config, cache_id=name, @@ -58,6 +63,16 @@ def __init__( self._authenticator.add_repository(name, url) self.get_page = functools.lru_cache(maxsize=None)(self._get_page) + self._lazy_wheel = config.get("solver.lazy-wheel", True) + # We are tracking if a domain supports range requests or not to avoid + # unnecessary requests. + # ATTENTION: A domain might support range requests only for some files, so the + # meaning is as follows: + # - Domain not in dict: We don't know anything. + # - True: The domain supports range requests for at least some files. + # - False: The domain does not support range requests for the files we tried. 
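+        # Illustrative state after probing two (hypothetical) hosts:
+        #   {"files.pythonhosted.org": True, "legacy.example.com": False}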
+ self._supports_range_requests: dict[str, bool] = {} + @property def session(self) -> Authenticator: return self._authenticator @@ -74,22 +89,63 @@ def certificates(self) -> RepositoryCertificateConfig: def authenticated_url(self) -> str: return self._authenticator.authenticated_url(url=self.url) - def _download(self, url: str, dest: Path) -> None: - return download_file(url, dest, session=self.session) + def _download( + self, url: str, dest: Path, *, raise_accepts_ranges: bool = False + ) -> None: + return download_file( + url, dest, session=self.session, raise_accepts_ranges=raise_accepts_ranges + ) @contextmanager - def _cached_or_downloaded_file(self, link: Link) -> Iterator[Path]: + def _cached_or_downloaded_file( + self, link: Link, *, raise_accepts_ranges: bool = False + ) -> Iterator[Path]: self._log(f"Downloading: {link.url}", level="debug") with temporary_directory() as temp_dir: filepath = Path(temp_dir) / link.filename - self._download(link.url, filepath) + self._download( + link.url, filepath, raise_accepts_ranges=raise_accepts_ranges + ) yield filepath def _get_info_from_wheel(self, url: str) -> PackageInfo: from poetry.inspection.info import PackageInfo - with self._cached_or_downloaded_file(Link(url)) as filepath: - return PackageInfo.from_wheel(filepath) + link = Link(url) + netloc = link.netloc + + # If "lazy-wheel" is enabled and the domain supports range requests + # or we don't know yet, we try range requests. + if self._lazy_wheel and self._supports_range_requests.get(netloc, True): + try: + package_info = PackageInfo.from_memory_wheel( + memory_wheel_from_url(link.filename, link.url, self.session) + ) + except HTTPRangeRequestUnsupported: + # Do not set to False if we already know that the domain supports + # range requests for some URLs! + if netloc not in self._supports_range_requests: + self._supports_range_requests[netloc] = False + else: + self._supports_range_requests[netloc] = True + return package_info + + try: + with self._cached_or_downloaded_file( + link, raise_accepts_ranges=self._lazy_wheel + ) as filepath: + return PackageInfo.from_wheel(filepath) + except HTTPRangeRequestSupported: + # The domain did not support range requests for the first URL(s) we tried, + # but supports it for some URLs (especially the current URL), + # so we abort the download, update _supports_range_requests to try + # range requests for all files and use it for the current URL. 
+ self._log( + f"Abort downloading {link.url} because server supports range requests", + level="debug", + ) + self._supports_range_requests[netloc] = True + return self._get_info_from_wheel(link.url) def _get_info_from_sdist(self, url: str) -> PackageInfo: from poetry.inspection.info import PackageInfo diff --git a/src/poetry/utils/authenticator.py b/src/poetry/utils/authenticator.py index 2628e5ba699..c5e913db630 100644 --- a/src/poetry/utils/authenticator.py +++ b/src/poetry/utils/authenticator.py @@ -270,6 +270,10 @@ def _get_backoff(self, response: requests.Response | None, attempt: int) -> floa def get(self, url: str, **kwargs: Any) -> requests.Response: return self.request("get", url, **kwargs) + def head(self, url: str, **kwargs: Any) -> requests.Response: + kwargs.setdefault("allow_redirects", False) + return self.request("head", url, **kwargs) + def post(self, url: str, **kwargs: Any) -> requests.Response: return self.request("post", url, **kwargs) diff --git a/src/poetry/utils/helpers.py b/src/poetry/utils/helpers.py index 9916be7b87f..fbc29e5b4dd 100644 --- a/src/poetry/utils/helpers.py +++ b/src/poetry/utils/helpers.py @@ -122,16 +122,25 @@ def merge_dicts(d1: dict[str, Any], d2: dict[str, Any]) -> None: d1[k] = d2[k] +class HTTPRangeRequestSupported(Exception): + """Raised when server unexpectedly supports byte ranges.""" + + def download_file( url: str, dest: Path, + *, session: Authenticator | Session | None = None, chunk_size: int = 1024, + raise_accepts_ranges: bool = False, ) -> None: from poetry.puzzle.provider import Indicator downloader = Downloader(url, dest, session) + if raise_accepts_ranges and downloader.accepts_ranges: + raise HTTPRangeRequestSupported(f"URL {url} supports range requests.") + set_indicator = False with Indicator.context() as update_context: update_context(f"Downloading {url}") @@ -170,6 +179,10 @@ def __init__( ) self._response.raise_for_status() + @cached_property + def accepts_ranges(self) -> bool: + return self._response.headers.get("Accept-Ranges") == "bytes" + @cached_property def total_size(self) -> int: total_size = 0 diff --git a/tests/console/commands/test_config.py b/tests/console/commands/test_config.py index 24b89a63d9b..6c15e6a35fb 100644 --- a/tests/console/commands/test_config.py +++ b/tests/console/commands/test_config.py @@ -13,6 +13,7 @@ from poetry.config.config_source import ConfigSource from poetry.console.commands.install import InstallCommand from poetry.factory import Factory +from poetry.repositories.legacy_repository import LegacyRepository from tests.conftest import Config @@ -58,6 +59,7 @@ def test_list_displays_default_value_if_not_set( installer.modern-installation = true installer.no-binary = null installer.parallel = true +solver.lazy-wheel = true virtualenvs.create = true virtualenvs.in-project = null virtualenvs.options.always-copy = false @@ -88,6 +90,7 @@ def test_list_displays_set_get_setting( installer.modern-installation = true installer.no-binary = null installer.parallel = true +solver.lazy-wheel = true virtualenvs.create = false virtualenvs.in-project = null virtualenvs.options.always-copy = false @@ -139,6 +142,7 @@ def test_unset_setting( installer.modern-installation = true installer.no-binary = null installer.parallel = true +solver.lazy-wheel = true virtualenvs.create = true virtualenvs.in-project = null virtualenvs.options.always-copy = false @@ -168,6 +172,7 @@ def test_unset_repo_setting( installer.modern-installation = true installer.no-binary = null installer.parallel = true +solver.lazy-wheel = 
true virtualenvs.create = true virtualenvs.in-project = null virtualenvs.options.always-copy = false @@ -296,6 +301,7 @@ def test_list_displays_set_get_local_setting( installer.modern-installation = true installer.no-binary = null installer.parallel = true +solver.lazy-wheel = true virtualenvs.create = false virtualenvs.in-project = null virtualenvs.options.always-copy = false @@ -334,6 +340,7 @@ def test_list_must_not_display_sources_from_pyproject_toml( installer.no-binary = null installer.parallel = true repositories.foo.url = "https://foo.bar/simple/" +solver.lazy-wheel = true virtualenvs.create = true virtualenvs.in-project = null virtualenvs.options.always-copy = false @@ -531,3 +538,21 @@ def test_config_installer_no_binary( config = Config.create(reload=True) assert not DeepDiff(config.get(setting), expected, ignore_order=True) + + +def test_config_solver_lazy_wheel( + tester: CommandTester, command_tester_factory: CommandTesterFactory +) -> None: + tester.execute("--local solver.lazy-wheel") + assert tester.io.fetch_output().strip() == "true" + + repo = LegacyRepository("foo", "https://foo.com") + assert repo._lazy_wheel + + tester.io.clear_output() + tester.execute("--local solver.lazy-wheel false") + tester.execute("--local solver.lazy-wheel") + assert tester.io.fetch_output().strip() == "false" + + repo = LegacyRepository("foo", "https://foo.com") + assert not repo._lazy_wheel diff --git a/tests/fixtures/distributions/demo_missing_dist_info-0.1.0-py2.py3-none-any.whl b/tests/fixtures/distributions/demo_missing_dist_info-0.1.0-py2.py3-none-any.whl new file mode 100644 index 0000000000000000000000000000000000000000..d59d962e77f1c8c87078f970ccc8a64d99e2f4be GIT binary patch literal 310 zcmWIWW@Zs#0Dn`T;;CTu?zph9<7UmtsI32n)cJ#K&jmWtPOp z>lIYS$CsrR6=&w>#m6hyDySRi8R{9Ra|L)aGTAfWvPcDLI> None: +def mock_download( + url: str, + dest: Path, + *, + session: Authenticator | Session | None = None, + chunk_size: int = 1024, + raise_accepts_ranges: bool = False, +) -> None: parts = urllib.parse.urlparse(url) fixture = FIXTURE_PATH / parts.path.lstrip("/") diff --git a/tests/inspection/conftest.py b/tests/inspection/conftest.py new file mode 100644 index 00000000000..87edc940adf --- /dev/null +++ b/tests/inspection/conftest.py @@ -0,0 +1,148 @@ +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING +from typing import Any +from typing import Dict +from typing import Protocol +from typing import Tuple +from urllib.parse import urlparse + +import pytest + +from requests import codes + + +if TYPE_CHECKING: + from collections.abc import Callable + + from httpretty.core import HTTPrettyRequest + + from tests.types import FixtureDirGetter + + HttPrettyResponse = Tuple[int, Dict[str, Any], bytes] # status code, headers, body + HttPrettyRequestCallback = Callable[ + [HTTPrettyRequest, str, Dict[str, Any]], HttPrettyResponse + ] + + class RequestCallbackFactory(Protocol): + def __call__( + self, + *, + accept_ranges: str | None = "bytes", + negative_offset_error: tuple[int, bytes] | None = None, + ) -> HttPrettyRequestCallback: ... 
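+
+
+# Sketch of how the ``handle_request_factory`` fixture defined below is consumed
+# by the tests in this diff (``http`` is the httpretty fixture, ``url`` points at
+# a fake index):
+#
+#     request_callback = handle_request_factory(accept_ranges="bytes")
+#     http.register_uri(http.GET, url, body=request_callback)
+#     http.register_uri(http.HEAD, url, body=request_callback)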
+ + +NEGATIVE_OFFSET_AS_POSITIVE = -1 + + +def build_head_response( + accept_ranges: str | None, content_length: int, response_headers: dict[str, Any] +) -> HttPrettyResponse: + response_headers["Content-Length"] = content_length + if accept_ranges: + response_headers["Accept-Ranges"] = accept_ranges + return 200, response_headers, b"" + + +def build_partial_response( + rng: str, + wheel_bytes: bytes, + response_headers: dict[str, Any], + *, + negative_offset_as_positive: bool = False, +) -> HttPrettyResponse: + status_code = 206 + response_headers["Accept-Ranges"] = "bytes" + total_length = len(wheel_bytes) + if rng.startswith("-"): + # negative offset + offset = int(rng) + if negative_offset_as_positive: + # some servers interpret a negative offset like "-10" as "0-10" + start = 0 + end = min(-offset, total_length - 1) + body = wheel_bytes[start : end + 1] + else: + start = total_length + offset + if start < 0: + # wheel is smaller than initial chunk size + return 200, response_headers, wheel_bytes + end = total_length - 1 + body = wheel_bytes[offset:] + else: + # range with start and end + rng_parts = rng.split("-") + start = int(rng_parts[0]) + end = int(rng_parts[1]) + body = wheel_bytes[start : end + 1] + response_headers["Content-Range"] = f"bytes {start}-{end}/{total_length}" + return status_code, response_headers, body + + +@pytest.fixture +def handle_request_factory(fixture_dir: FixtureDirGetter) -> RequestCallbackFactory: + def _factory( + *, + accept_ranges: str | None = "bytes", + negative_offset_error: tuple[int, bytes] | None = None, + ) -> HttPrettyRequestCallback: + def handle_request( + request: HTTPrettyRequest, uri: str, response_headers: dict[str, Any] + ) -> HttPrettyResponse: + name = Path(urlparse(uri).path).name + + wheel = Path(__file__).parent.parent.joinpath( + "repositories/fixtures/pypi.org/dists/" + name + ) + + if not wheel.exists(): + wheel = fixture_dir("distributions") / name + + if not wheel.exists(): + wheel = ( + fixture_dir("distributions") / "demo-0.1.0-py2.py3-none-any.whl" + ) + + wheel_bytes = wheel.read_bytes() + + del response_headers["status"] + + if request.method == "HEAD": + return build_head_response( + accept_ranges, len(wheel_bytes), response_headers + ) + + rng = request.headers.get("Range", "") + if rng: + rng = rng.split("=")[1] + + negative_offset_as_positive = False + if negative_offset_error and rng.startswith("-"): + if negative_offset_error[0] == codes.requested_range_not_satisfiable: + response_headers["Content-Range"] = f"bytes */{len(wheel_bytes)}" + if negative_offset_error[0] == NEGATIVE_OFFSET_AS_POSITIVE: + negative_offset_as_positive = True + else: + return ( + negative_offset_error[0], + response_headers, + negative_offset_error[1], + ) + if accept_ranges == "bytes" and rng: + return build_partial_response( + rng, + wheel_bytes, + response_headers, + negative_offset_as_positive=negative_offset_as_positive, + ) + + status_code = 200 + body = wheel_bytes + + return status_code, response_headers, body + + return handle_request + + return _factory diff --git a/tests/inspection/test_info.py b/tests/inspection/test_info.py index dfa0f1dafc8..3e59c7e518b 100644 --- a/tests/inspection/test_info.py +++ b/tests/inspection/test_info.py @@ -6,9 +6,12 @@ from typing import TYPE_CHECKING import pytest +import requests from poetry.inspection.info import PackageInfo from poetry.inspection.info import PackageInfoError +from poetry.inspection.lazy_wheel import MemoryWheel +from poetry.inspection.lazy_wheel import memory_wheel_from_url from 
poetry.utils.env import EnvCommandError from poetry.utils.env import VirtualEnv @@ -16,8 +19,10 @@ if TYPE_CHECKING: from pathlib import Path + from httpretty import httpretty from pytest_mock import MockerFixture + from tests.inspection.conftest import RequestCallbackFactory from tests.types import FixtureDirGetter @@ -36,6 +41,18 @@ def demo_wheel(fixture_dir: FixtureDirGetter) -> Path: return fixture_dir("distributions") / "demo-0.1.0-py2.py3-none-any.whl" +@pytest.fixture +def demo_memory_wheel( + http: type[httpretty], + handle_request_factory: RequestCallbackFactory, +) -> MemoryWheel: + url = "https://foo.com/demo-0.1.0-py2.py3-none-any.whl" + request_callback = handle_request_factory() + http.register_uri(http.GET, url, body=request_callback) + + return memory_wheel_from_url("demo", url, requests.Session()) + + @pytest.fixture def source_dir(tmp_path: Path) -> Path: path = tmp_path / "source" @@ -162,6 +179,13 @@ def test_info_from_wheel(demo_wheel: Path) -> None: assert info._source_url == demo_wheel.resolve().as_posix() +def test_info_from_memory_wheel(demo_memory_wheel: MemoryWheel) -> None: + info = PackageInfo.from_memory_wheel(demo_memory_wheel) + demo_check_info(info) + assert info._source_type is None + assert info._source_url is None + + def test_info_from_bdist(demo_wheel: Path) -> None: info = PackageInfo.from_bdist(demo_wheel) demo_check_info(info) diff --git a/tests/inspection/test_lazy_wheel.py b/tests/inspection/test_lazy_wheel.py new file mode 100644 index 00000000000..6c21b86a71a --- /dev/null +++ b/tests/inspection/test_lazy_wheel.py @@ -0,0 +1,188 @@ +from __future__ import annotations + +import re + +from typing import TYPE_CHECKING + +import httpretty +import pytest +import requests + +from requests import codes + +from poetry.inspection.lazy_wheel import HTTPRangeRequestUnsupported +from poetry.inspection.lazy_wheel import InvalidWheel +from poetry.inspection.lazy_wheel import memory_wheel_from_url +from tests.inspection.conftest import NEGATIVE_OFFSET_AS_POSITIVE + + +if TYPE_CHECKING: + from tests.inspection.conftest import RequestCallbackFactory + + +@pytest.mark.parametrize( + "negative_offset_error", + [ + None, + (codes.method_not_allowed, b"Method not allowed"), + (codes.requested_range_not_satisfiable, b"Requested range not satisfiable"), + (codes.not_implemented, b"Unsupported client range"), + (NEGATIVE_OFFSET_AS_POSITIVE, b"handle negative offset as positive"), + ], +) +def test_memory_wheel_from_url( + http: type[httpretty.httpretty], + handle_request_factory: RequestCallbackFactory, + negative_offset_error: tuple[int, bytes] | None, +) -> None: + domain = ( + f"lazy-wheel-{negative_offset_error[0] if negative_offset_error else 0}.com" + ) + uri_regex = re.compile(f"^https://{domain}/.*$") + request_callback = handle_request_factory( + negative_offset_error=negative_offset_error + ) + http.register_uri(http.GET, uri_regex, body=request_callback) + http.register_uri(http.HEAD, uri_regex, body=request_callback) + + url = f"https://{domain}/poetry_core-1.5.0-py3-none-any.whl" + + wheel = memory_wheel_from_url("poetry-core", url, requests.Session()) + + assert wheel.name == "poetry-core" + assert wheel.version == "1.5.0" + assert wheel.author == "Sébastien Eustace" + assert wheel.requires_dist == [ + 'importlib-metadata (>=1.7.0) ; python_version < "3.8"' + ] + + # negative offsets supported: + # 1. end of central directory + # 2. whole central directory + # 3. METADATA file + # negative offsets not supported: + # 1. failed range request + # 2. 
HEAD request + # 3.-5. see negative offsets 1.-3. + expected_requests = 3 + if negative_offset_error: + if negative_offset_error[0] in ( + codes.requested_range_not_satisfiable, + NEGATIVE_OFFSET_AS_POSITIVE, + ): + expected_requests += 1 + else: + expected_requests += 2 + latest_requests = http.latest_requests() + assert len(latest_requests) == expected_requests + + # second wheel -> one less request if negative offsets are not supported + latest_requests.clear() + memory_wheel_from_url("poetry-core", url, requests.Session()) + expected_requests = min(expected_requests, 4) + latest_requests = httpretty.latest_requests() + assert len(latest_requests) == expected_requests + + +@pytest.mark.parametrize("negative_offset_as_positive", [False, True]) +def test_memory_wheel_from_url_smaller_than_initial_chunk_size( + http: type[httpretty.httpretty], + handle_request_factory: RequestCallbackFactory, + negative_offset_as_positive: bool, +) -> None: + domain = f"tiny-wheel-{str(negative_offset_as_positive).lower()}.com" + uri_regex = re.compile(f"^https://{domain}/.*$") + request_callback = handle_request_factory( + negative_offset_error=( + (NEGATIVE_OFFSET_AS_POSITIVE, b"") if negative_offset_as_positive else None + ) + ) + http.register_uri(http.GET, uri_regex, body=request_callback) + http.register_uri(http.HEAD, uri_regex, body=request_callback) + + url = f"https://{domain}/zipp-3.5.0-py3-none-any.whl" + + wheel = memory_wheel_from_url("zipp", url, requests.Session()) + + assert wheel.name == "zipp" + assert wheel.version == "3.5.0" + assert wheel.author == "Jason R. Coombs" + assert len(wheel.requires_dist) == 12 + + # only one request because server gives a normal response with the entire wheel + # except for when server interprets negative offset as positive + latest_requests = http.latest_requests() + assert len(latest_requests) == 1 + + +@pytest.mark.parametrize("accept_ranges", [None, "none"]) +def test_memory_wheel_from_url_range_requests_not_supported_one_request( + http: type[httpretty.httpretty], + handle_request_factory: RequestCallbackFactory, + accept_ranges: str | None, +) -> None: + domain = "no-range-requests.com" + uri_regex = re.compile(f"^https://{domain}/.*$") + request_callback = handle_request_factory(accept_ranges=accept_ranges) + http.register_uri(http.GET, uri_regex, body=request_callback) + http.register_uri(http.HEAD, uri_regex, body=request_callback) + + url = f"https://{domain}/poetry_core-1.5.0-py3-none-any.whl" + + with pytest.raises(HTTPRangeRequestUnsupported): + memory_wheel_from_url("poetry-core", url, requests.Session()) + + latest_requests = http.latest_requests() + assert len(latest_requests) == 1 + assert latest_requests[0].method == "GET" + + +@pytest.mark.parametrize( + "negative_offset_error", + [ + (codes.method_not_allowed, b"Method not allowed"), + (codes.not_implemented, b"Unsupported client range"), + ], +) +def test_memory_wheel_from_url_range_requests_not_supported_two_requests( + http: type[httpretty.httpretty], + handle_request_factory: RequestCallbackFactory, + negative_offset_error: tuple[int, bytes], +) -> None: + domain = f"no-negative-offsets-{negative_offset_error[0]}.com" + uri_regex = re.compile(f"^https://{domain}/.*$") + request_callback = handle_request_factory( + accept_ranges=None, negative_offset_error=negative_offset_error + ) + http.register_uri(http.GET, uri_regex, body=request_callback) + http.register_uri(http.HEAD, uri_regex, body=request_callback) + + url = f"https://{domain}/poetry_core-1.5.0-py3-none-any.whl" + + with 
pytest.raises(HTTPRangeRequestUnsupported): + memory_wheel_from_url("poetry-core", url, requests.Session()) + + latest_requests = http.latest_requests() + assert len(latest_requests) == 2 + assert latest_requests[0].method == "GET" + assert latest_requests[1].method == "HEAD" + + +def test_memory_wheel_from_url_invalid_wheel( + http: type[httpretty.httpretty], + handle_request_factory: RequestCallbackFactory, +) -> None: + domain = "invalid-wheel.com" + uri_regex = re.compile(f"^https://{domain}/.*$") + request_callback = handle_request_factory() + http.register_uri(http.GET, uri_regex, body=request_callback) + http.register_uri(http.HEAD, uri_regex, body=request_callback) + + url = f"https://{domain}/demo_missing_dist_info-0.1.0-py2.py3-none-any.whl" + + with pytest.raises(InvalidWheel): + memory_wheel_from_url("demo-missing-dist-info", url, requests.Session()) + + latest_requests = http.latest_requests() + assert len(latest_requests) == 1 + assert latest_requests[0].method == "GET" diff --git a/tests/repositories/test_http_repository.py b/tests/repositories/test_http_repository.py new file mode 100644 index 00000000000..0eb0e9046e0 --- /dev/null +++ b/tests/repositories/test_http_repository.py @@ -0,0 +1,144 @@ +from __future__ import annotations + +import shutil + +from pathlib import Path +from typing import TYPE_CHECKING +from typing import Any + +import pytest + +from poetry.inspection.lazy_wheel import HTTPRangeRequestUnsupported +from poetry.inspection.lazy_wheel import MemoryWheel +from poetry.repositories.http_repository import HTTPRepository +from poetry.utils.helpers import HTTPRangeRequestSupported + + +if TYPE_CHECKING: + from packaging.utils import NormalizedName + from poetry.core.constraints.version import Version + from pytest_mock import MockerFixture + + +class MockRepository(HTTPRepository): + DIST_FIXTURES = Path(__file__).parent / "fixtures" / "pypi.org" / "dists" + + def __init__(self, lazy_wheel: bool = True) -> None: + super().__init__("foo", "https://foo.com") + self._lazy_wheel = lazy_wheel + + def _get_release_info( + self, name: NormalizedName, version: Version + ) -> dict[str, Any]: + raise NotImplementedError + + +@pytest.mark.parametrize("lazy_wheel", [False, True]) +@pytest.mark.parametrize("supports_range_requests", [None, False, True]) +def test_get_info_from_wheel( + mocker: MockerFixture, lazy_wheel: bool, supports_range_requests: bool | None +) -> None: + filename = "poetry_core-1.5.0-py3-none-any.whl" + filepath = MockRepository.DIST_FIXTURES / filename + + mock_memory_wheel_from_url = mocker.patch( + "poetry.repositories.http_repository.memory_wheel_from_url", + return_value=MemoryWheel(filepath), # type: ignore[arg-type] + ) + mock_download = mocker.patch( + "poetry.repositories.http_repository.download_file", + side_effect=lambda _, dest, *args, **kwargs: shutil.copy(filepath, dest), + ) + + domain = "foo.com" + url = f"https://{domain}/{filename}" + repo = MockRepository(lazy_wheel) + assert not repo._supports_range_requests + if lazy_wheel and supports_range_requests is not None: + repo._supports_range_requests[domain] = supports_range_requests + + info = repo._get_info_from_wheel(url) + assert info.name == "poetry-core" + assert info.version == "1.5.0" + assert info.requires_dist == [ + 'importlib-metadata (>=1.7.0) ; python_version < "3.8"' + ] + + if lazy_wheel and (supports_range_requests or supports_range_requests is None): + mock_memory_wheel_from_url.assert_called_once_with(filename, url, repo.session) + mock_download.assert_not_called() 
+ assert repo._supports_range_requests[domain] is True + else: + mock_memory_wheel_from_url.assert_not_called() + mock_download.assert_called_once_with( + url, mocker.ANY, session=repo.session, raise_accepts_ranges=lazy_wheel + ) + if lazy_wheel: + assert repo._supports_range_requests[domain] is False + else: + assert domain not in repo._supports_range_requests + + +def test_get_info_from_wheel_state_sequence(mocker: MockerFixture) -> None: + """ + 1. We know nothing: + Try range requests, which are not supported and fall back to complete download. + 2. Range requests were not supported so far: + We do not try range requests again. + 3. Range requests were still not supported so far: + We do not try range requests again, but we notice that the response header + contains "Accept-Ranges: bytes", so range requests are at least supported + for some files, which means we want to try again. + 4. Range requests are supported for some files: + We try range requests (success). + 5. Range requests are supported for some files: + We try range requests (failure), but do not forget that range requests are + supported for some files. + 6. Range requests are supported for some files: + We try range requests (success). + """ + mock_memory_wheel_from_url = mocker.patch( + "poetry.repositories.http_repository.memory_wheel_from_url" + ) + mock_download = mocker.patch("poetry.repositories.http_repository.download_file") + + filename = "poetry_core-1.5.0-py3-none-any.whl" + domain = "foo.com" + url = f"https://{domain}/{filename}" + repo = MockRepository() + + # 1. range request and download + mock_memory_wheel_from_url.side_effect = HTTPRangeRequestUnsupported + repo._get_info_from_wheel(url) + assert mock_memory_wheel_from_url.call_count == 1 + assert mock_download.call_count == 1 + + # 2. only download + repo._get_info_from_wheel(url) + assert mock_memory_wheel_from_url.call_count == 1 + assert mock_download.call_count == 2 + + # 3. range request and download + mock_memory_wheel_from_url.side_effect = None + mock_download.side_effect = HTTPRangeRequestSupported + repo._get_info_from_wheel(url) + assert mock_memory_wheel_from_url.call_count == 2 + assert mock_download.call_count == 3 + + # 4. only range request + repo._get_info_from_wheel(url) + assert mock_memory_wheel_from_url.call_count == 3 + assert mock_download.call_count == 3 + + # 5. range request and download + mock_memory_wheel_from_url.side_effect = HTTPRangeRequestUnsupported + mock_download.side_effect = None + repo._get_info_from_wheel(url) + assert mock_memory_wheel_from_url.call_count == 4 + assert mock_download.call_count == 4 + + # 6. 
only range request + mock_memory_wheel_from_url.side_effect = None + repo._get_info_from_wheel(url) + assert mock_memory_wheel_from_url.call_count == 5 + assert mock_download.call_count == 4 diff --git a/tests/repositories/test_legacy_repository.py b/tests/repositories/test_legacy_repository.py index 3577836d420..71b831641fa 100644 --- a/tests/repositories/test_legacy_repository.py +++ b/tests/repositories/test_legacy_repository.py @@ -41,6 +41,7 @@ class MockRepository(LegacyRepository): def __init__(self) -> None: super().__init__("legacy", url="http://legacy.foo.bar", disable_cache=True) + self._lazy_wheel = False def _get_page(self, name: NormalizedName) -> SimpleRepositoryPage: fixture = self.FIXTURES / (name + ".html") @@ -50,7 +51,9 @@ def _get_page(self, name: NormalizedName) -> SimpleRepositoryPage: with fixture.open(encoding="utf-8") as f: return SimpleRepositoryPage(self._url + f"/{name}/", f.read()) - def _download(self, url: str, dest: Path) -> None: + def _download( + self, url: str, dest: Path, *, raise_accepts_ranges: bool = False + ) -> None: filename = Link(url).filename filepath = self.FIXTURES.parent / "pypi.org" / "dists" / filename diff --git a/tests/repositories/test_pypi_repository.py b/tests/repositories/test_pypi_repository.py index 9169430729e..a5a437efac8 100644 --- a/tests/repositories/test_pypi_repository.py +++ b/tests/repositories/test_pypi_repository.py @@ -38,6 +38,7 @@ class MockRepository(PyPiRepository): def __init__(self, fallback: bool = False) -> None: super().__init__(url="http://foo.bar", disable_cache=True, fallback=fallback) + self._lazy_wheel = False def get_json_page(self, name: NormalizedName) -> SimpleJsonPage: fixture = self.JSON_FIXTURES / (name + ".json") @@ -66,7 +67,9 @@ def _get( data: dict[str, Any] = json.load(f) return data - def _download(self, url: str, dest: Path) -> None: + def _download( + self, url: str, dest: Path, *, raise_accepts_ranges: bool = False + ) -> None: filename = url.split("/")[-1] fixture = self.DIST_FIXTURES / filename diff --git a/tests/repositories/test_single_page_repository.py b/tests/repositories/test_single_page_repository.py index cce758bb312..bbe3002c8e7 100644 --- a/tests/repositories/test_single_page_repository.py +++ b/tests/repositories/test_single_page_repository.py @@ -25,6 +25,7 @@ def __init__(self, page: str) -> None: url=f"http://single-page.foo.bar/{page}.html", disable_cache=True, ) + self._lazy_wheel = False def _get_page(self, name: NormalizedName) -> SimpleRepositoryPage: fixture = self.FIXTURES / self.url.rsplit("/", 1)[-1] @@ -34,7 +35,9 @@ def _get_page(self, name: NormalizedName) -> SimpleRepositoryPage: with fixture.open(encoding="utf-8") as f: return SimpleRepositoryPage(self._url, f.read()) - def _download(self, url: str, dest: Path) -> None: + def _download( + self, url: str, dest: Path, *, raise_accepts_ranges: bool = False + ) -> None: raise RuntimeError("Tests are not configured for downloads") diff --git a/tests/utils/test_helpers.py b/tests/utils/test_helpers.py index e4713f80ff1..2399e298552 100644 --- a/tests/utils/test_helpers.py +++ b/tests/utils/test_helpers.py @@ -1,11 +1,13 @@ from __future__ import annotations from typing import TYPE_CHECKING +from typing import Any import pytest from poetry.core.utils.helpers import parse_requires +from poetry.utils.helpers import HTTPRangeRequestSupported from poetry.utils.helpers import download_file from poetry.utils.helpers import get_file_hash from poetry.utils.helpers import get_highest_priority_hash_type @@ -15,6 +17,7 @@ from 
pathlib import Path from httpretty import httpretty + from httpretty.core import HTTPrettyRequest from tests.types import FixtureDirGetter @@ -153,3 +156,35 @@ def test_download_file( ) def test_highest_priority_hash_type(hash_types: set[str], expected: str | None) -> None: assert get_highest_priority_hash_type(hash_types, "Blah") == expected + + +@pytest.mark.parametrize("accepts_ranges", [False, True]) +@pytest.mark.parametrize("raise_accepts_ranges", [False, True]) +def test_download_file_raise_accepts_ranges( + http: type[httpretty], + fixture_dir: FixtureDirGetter, + tmp_path: Path, + accepts_ranges: bool, + raise_accepts_ranges: bool, +) -> None: + filename = "demo-0.1.0-py2.py3-none-any.whl" + + def handle_request( + request: HTTPrettyRequest, uri: str, response_headers: dict[str, Any] + ) -> tuple[int, dict[str, Any], bytes]: + file_path = fixture_dir("distributions") / filename + if accepts_ranges: + response_headers["Accept-Ranges"] = "bytes" + return 200, response_headers, file_path.read_bytes() + + url = f"https://foo.com/{filename}" + http.register_uri(http.GET, url, body=handle_request) + dest = tmp_path / filename + + if accepts_ranges and raise_accepts_ranges: + with pytest.raises(HTTPRangeRequestSupported): + download_file(url, dest, raise_accepts_ranges=raise_accepts_ranges) + assert not dest.exists() + else: + download_file(url, dest, raise_accepts_ranges=raise_accepts_ranges) + assert dest.is_file() From f10e0066045786469f2a64ff441ce4ae75449dda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Randy=20D=C3=B6ring?= <30527984+radoering@users.noreply.github.com> Date: Fri, 22 Dec 2023 18:30:14 +0100 Subject: [PATCH 2/9] use packaging.metadata instead of pkginfo --- src/poetry/inspection/info.py | 28 ++-- src/poetry/inspection/lazy_wheel.py | 59 +++---- src/poetry/repositories/http_repository.py | 6 +- tests/inspection/conftest.py | 148 ----------------- tests/inspection/test_info.py | 41 +++-- tests/inspection/test_lazy_wheel.py | 180 ++++++++++++++++++--- tests/repositories/test_http_repository.py | 42 ++--- 7 files changed, 253 insertions(+), 251 deletions(-) delete mode 100644 tests/inspection/conftest.py diff --git a/src/poetry/inspection/info.py b/src/poetry/inspection/info.py index 9afb4612b37..362cb6a7370 100644 --- a/src/poetry/inspection/info.py +++ b/src/poetry/inspection/info.py @@ -30,10 +30,9 @@ if TYPE_CHECKING: from collections.abc import Iterator + from packaging.metadata import RawMetadata from poetry.core.packages.project_package import ProjectPackage - from poetry.inspection.lazy_wheel import MemoryWheel - logger = logging.getLogger(__name__) @@ -231,7 +230,9 @@ def to_package( return package @classmethod - def _from_distribution(cls, dist: pkginfo.Distribution) -> PackageInfo: + def _from_distribution( + cls, dist: pkginfo.BDist | pkginfo.SDist | pkginfo.Wheel + ) -> PackageInfo: """ Helper method to parse package information from a `pkginfo.Distribution` instance. 
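For reference, a minimal illustrative sketch (not part of the patch) of the packaging.metadata API this series switches to: parse_email() turns the raw METADATA bytes of a wheel into a RawMetadata dict, which is the structure PackageInfo consumes in the hunks below. It assumes the demo wheel fixture used by the tests later in this series.

# Sketch only, assuming the local demo wheel fixture from the test suite.
from zipfile import ZipFile

from packaging.metadata import parse_email

with ZipFile("demo-0.1.0-py2.py3-none-any.whl") as zf:
    # parse_email() returns the parsed RawMetadata plus any unparsed fields.
    raw_metadata, _unparsed = parse_email(zf.read("demo-0.1.0.dist-info/METADATA"))

assert raw_metadata["name"] == "demo"
assert raw_metadata["version"] == "0.1.0"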
@@ -242,7 +243,7 @@ def _from_distribution(cls, dist: pkginfo.Distribution) -> PackageInfo: if dist.requires_dist: requirements = list(dist.requires_dist) - elif isinstance(dist, (pkginfo.BDist, pkginfo.SDist, pkginfo.Wheel)): + else: requires = Path(dist.filename) / "requires.txt" if requires.exists(): text = requires.read_text(encoding="utf-8") @@ -256,9 +257,8 @@ def _from_distribution(cls, dist: pkginfo.Distribution) -> PackageInfo: requires_python=dist.requires_python, ) - if isinstance(dist, (pkginfo.BDist, pkginfo.SDist, pkginfo.Wheel)): - info._source_type = "file" - info._source_url = Path(dist.filename).resolve().as_posix() + info._source_type = "file" + info._source_url = Path(dist.filename).resolve().as_posix() return info @@ -522,13 +522,19 @@ def from_wheel(cls, path: Path) -> PackageInfo: return PackageInfo() @classmethod - def from_memory_wheel(cls, wheel: MemoryWheel) -> PackageInfo: + def from_wheel_metadata(cls, metadata: RawMetadata) -> PackageInfo: """ - Gather package information from a partial fetched wheel kept in memory. + Gather package information from metadata of a remote wheel. - :param path: Path to wheel. + :param metadata: metadata of the wheel. """ - return cls._from_distribution(wheel) + return cls( + name=metadata.get("name"), + version=metadata.get("version"), + summary=metadata.get("summary"), + requires_dist=metadata.get("requires_dist"), + requires_python=metadata.get("requires_python"), + ) @classmethod def from_bdist(cls, path: Path) -> PackageInfo: diff --git a/src/poetry/inspection/lazy_wheel.py b/src/poetry/inspection/lazy_wheel.py index 7542ed3abe0..0361d788594 100644 --- a/src/poetry/inspection/lazy_wheel.py +++ b/src/poetry/inspection/lazy_wheel.py @@ -20,7 +20,7 @@ from zipfile import BadZipFile from zipfile import ZipFile -from pkginfo import Distribution +from packaging.metadata import parse_email from requests.models import CONTENT_CHUNK_SIZE from requests.models import HTTPError from requests.models import Response @@ -31,6 +31,7 @@ from collections.abc import Iterable from collections.abc import Iterator + from packaging.metadata import RawMetadata from requests import Session from poetry.utils.authenticator import Authenticator @@ -58,29 +59,10 @@ def __str__(self) -> str: return f"Wheel '{self.name}' located at {self.location} is invalid." -class MemoryWheel(Distribution): - def __init__(self, lazy_file: LazyWheelOverHTTP) -> None: - self.lazy_file = lazy_file - self.extractMetadata() - - def read(self) -> bytes: - with ZipFile(self.lazy_file) as archive: - tuples = [x.split("/") for x in archive.namelist() if "METADATA" in x] - schwarz = sorted([(len(x), x) for x in tuples]) - for path in [x[1] for x in schwarz]: - candidate = "/".join(path) - logger.debug(f"read {candidate}") - data = archive.read(candidate) - if b"Metadata-Version" in data: - return data - else: - raise ValueError(f"No METADATA in archive: {self.lazy_file.name}") - - -def memory_wheel_from_url( +def metadata_from_wheel_url( name: str, url: str, session: Session | Authenticator -) -> MemoryWheel: - """Return a MemoryWheel (compatible to pkginfo.Wheel) from the given wheel URL. +) -> RawMetadata: + """Fetch metadata from the given wheel URL. This uses HTTP range requests to only fetch the portion of the wheel containing metadata, just enough for the object to be constructed. @@ -92,10 +74,10 @@ def memory_wheel_from_url( # After context manager exit, wheel.name will point to a deleted file path. # Add `delete_backing_file=False` to disable this for debugging. 
with LazyWheelOverHTTP(url, session) as lazy_file: - # prefetch metadata to reduce the number of range requests - # (we know that METADATA is the only file from the wheel we need) - lazy_file.prefetch_metadata(name) - return MemoryWheel(lazy_file) + metadata_bytes = lazy_file.read_metadata(name) + + metadata, _ = parse_email(metadata_bytes) + return metadata except (BadZipFile, UnsupportedWheel): # We assume that these errors have occurred because the wheel contents @@ -720,7 +702,7 @@ def _extract_content_length( self._domains_without_negative_range.add(domain) return file_length, tail - def prefetch_metadata(self, name: str) -> None: + def _prefetch_metadata(self, name: str) -> str: """Locate the *.dist-info/METADATA entry from a temporary ``ZipFile`` wrapper, and download it. @@ -729,7 +711,7 @@ def prefetch_metadata(self, name: str) -> None: can be downloaded in a single ranged GET request.""" logger.debug("begin prefetching METADATA for %s", name) - dist_info_prefix = re.compile(r"^[^/]*\.dist-info/METADATA") + metadata_regex = re.compile(r"^[^/]*\.dist-info/METADATA$") start: int | None = None end: int | None = None @@ -738,25 +720,36 @@ def prefetch_metadata(self, name: str) -> None: # should be set large enough to avoid this). zf = ZipFile(self) + filename = "" for info in zf.infolist(): if start is None: - if dist_info_prefix.search(info.filename): + if metadata_regex.search(info.filename): + filename = info.filename start = info.header_offset continue else: # The last .dist-info/ entry may be before the end of the file if the # wheel's entries are sorted lexicographically (which is unusual). - if not dist_info_prefix.search(info.filename): + if not metadata_regex.search(info.filename): end = info.header_offset break if start is None: raise UnsupportedWheel( - f"no {dist_info_prefix!r} found for {name} in {self.name}" + f"no {metadata_regex!r} found for {name} in {self.name}" ) # If it is the last entry of the zip, then give us everything # until the start of the central directory. if end is None: end = zf.start_dir - logger.debug("fetch METADATA") + logger.debug(f"fetch {filename}") self.ensure_downloaded(start, end) logger.debug("done prefetching METADATA for %s", name) + + return filename + + def read_metadata(self, name: str) -> bytes: + """Download and read the METADATA file from the remote wheel.""" + with ZipFile(self) as zf: + # prefetch metadata to reduce the number of range requests + filename = self._prefetch_metadata(name) + return zf.read(filename) diff --git a/src/poetry/repositories/http_repository.py b/src/poetry/repositories/http_repository.py index 546dfc4dcdc..9c89ec83ee1 100644 --- a/src/poetry/repositories/http_repository.py +++ b/src/poetry/repositories/http_repository.py @@ -21,7 +21,7 @@ from poetry.config.config import Config from poetry.inspection.lazy_wheel import HTTPRangeRequestUnsupported -from poetry.inspection.lazy_wheel import memory_wheel_from_url +from poetry.inspection.lazy_wheel import metadata_from_wheel_url from poetry.repositories.cached_repository import CachedRepository from poetry.repositories.exceptions import PackageNotFound from poetry.repositories.exceptions import RepositoryError @@ -118,8 +118,8 @@ def _get_info_from_wheel(self, url: str) -> PackageInfo: # or we don't know yet, we try range requests. 
if self._lazy_wheel and self._supports_range_requests.get(netloc, True): try: - package_info = PackageInfo.from_memory_wheel( - memory_wheel_from_url(link.filename, link.url, self.session) + package_info = PackageInfo.from_wheel_metadata( + metadata_from_wheel_url(link.filename, link.url, self.session) ) except HTTPRangeRequestUnsupported: # Do not set to False if we already know that the domain supports diff --git a/tests/inspection/conftest.py b/tests/inspection/conftest.py deleted file mode 100644 index 87edc940adf..00000000000 --- a/tests/inspection/conftest.py +++ /dev/null @@ -1,148 +0,0 @@ -from __future__ import annotations - -from pathlib import Path -from typing import TYPE_CHECKING -from typing import Any -from typing import Dict -from typing import Protocol -from typing import Tuple -from urllib.parse import urlparse - -import pytest - -from requests import codes - - -if TYPE_CHECKING: - from collections.abc import Callable - - from httpretty.core import HTTPrettyRequest - - from tests.types import FixtureDirGetter - - HttPrettyResponse = Tuple[int, Dict[str, Any], bytes] # status code, headers, body - HttPrettyRequestCallback = Callable[ - [HTTPrettyRequest, str, Dict[str, Any]], HttPrettyResponse - ] - - class RequestCallbackFactory(Protocol): - def __call__( - self, - *, - accept_ranges: str | None = "bytes", - negative_offset_error: tuple[int, bytes] | None = None, - ) -> HttPrettyRequestCallback: ... - - -NEGATIVE_OFFSET_AS_POSITIVE = -1 - - -def build_head_response( - accept_ranges: str | None, content_length: int, response_headers: dict[str, Any] -) -> HttPrettyResponse: - response_headers["Content-Length"] = content_length - if accept_ranges: - response_headers["Accept-Ranges"] = accept_ranges - return 200, response_headers, b"" - - -def build_partial_response( - rng: str, - wheel_bytes: bytes, - response_headers: dict[str, Any], - *, - negative_offset_as_positive: bool = False, -) -> HttPrettyResponse: - status_code = 206 - response_headers["Accept-Ranges"] = "bytes" - total_length = len(wheel_bytes) - if rng.startswith("-"): - # negative offset - offset = int(rng) - if negative_offset_as_positive: - # some servers interpret a negative offset like "-10" as "0-10" - start = 0 - end = min(-offset, total_length - 1) - body = wheel_bytes[start : end + 1] - else: - start = total_length + offset - if start < 0: - # wheel is smaller than initial chunk size - return 200, response_headers, wheel_bytes - end = total_length - 1 - body = wheel_bytes[offset:] - else: - # range with start and end - rng_parts = rng.split("-") - start = int(rng_parts[0]) - end = int(rng_parts[1]) - body = wheel_bytes[start : end + 1] - response_headers["Content-Range"] = f"bytes {start}-{end}/{total_length}" - return status_code, response_headers, body - - -@pytest.fixture -def handle_request_factory(fixture_dir: FixtureDirGetter) -> RequestCallbackFactory: - def _factory( - *, - accept_ranges: str | None = "bytes", - negative_offset_error: tuple[int, bytes] | None = None, - ) -> HttPrettyRequestCallback: - def handle_request( - request: HTTPrettyRequest, uri: str, response_headers: dict[str, Any] - ) -> HttPrettyResponse: - name = Path(urlparse(uri).path).name - - wheel = Path(__file__).parent.parent.joinpath( - "repositories/fixtures/pypi.org/dists/" + name - ) - - if not wheel.exists(): - wheel = fixture_dir("distributions") / name - - if not wheel.exists(): - wheel = ( - fixture_dir("distributions") / "demo-0.1.0-py2.py3-none-any.whl" - ) - - wheel_bytes = wheel.read_bytes() - - del 
response_headers["status"] - - if request.method == "HEAD": - return build_head_response( - accept_ranges, len(wheel_bytes), response_headers - ) - - rng = request.headers.get("Range", "") - if rng: - rng = rng.split("=")[1] - - negative_offset_as_positive = False - if negative_offset_error and rng.startswith("-"): - if negative_offset_error[0] == codes.requested_range_not_satisfiable: - response_headers["Content-Range"] = f"bytes */{len(wheel_bytes)}" - if negative_offset_error[0] == NEGATIVE_OFFSET_AS_POSITIVE: - negative_offset_as_positive = True - else: - return ( - negative_offset_error[0], - response_headers, - negative_offset_error[1], - ) - if accept_ranges == "bytes" and rng: - return build_partial_response( - rng, - wheel_bytes, - response_headers, - negative_offset_as_positive=negative_offset_as_positive, - ) - - status_code = 200 - body = wheel_bytes - - return status_code, response_headers, body - - return handle_request - - return _factory diff --git a/tests/inspection/test_info.py b/tests/inspection/test_info.py index 3e59c7e518b..fed8efcf444 100644 --- a/tests/inspection/test_info.py +++ b/tests/inspection/test_info.py @@ -4,14 +4,14 @@ from subprocess import CalledProcessError from typing import TYPE_CHECKING +from zipfile import ZipFile import pytest -import requests + +from packaging.metadata import parse_email from poetry.inspection.info import PackageInfo from poetry.inspection.info import PackageInfoError -from poetry.inspection.lazy_wheel import MemoryWheel -from poetry.inspection.lazy_wheel import memory_wheel_from_url from poetry.utils.env import EnvCommandError from poetry.utils.env import VirtualEnv @@ -19,10 +19,9 @@ if TYPE_CHECKING: from pathlib import Path - from httpretty import httpretty + from packaging.metadata import RawMetadata from pytest_mock import MockerFixture - from tests.inspection.conftest import RequestCallbackFactory from tests.types import FixtureDirGetter @@ -42,15 +41,10 @@ def demo_wheel(fixture_dir: FixtureDirGetter) -> Path: @pytest.fixture -def demo_memory_wheel( - http: type[httpretty], - handle_request_factory: RequestCallbackFactory, -) -> MemoryWheel: - url = "https://foo.com/demo-0.1.0-py2.py3-none-any.whl" - request_callback = handle_request_factory() - http.register_uri(http.GET, url, body=request_callback) - - return memory_wheel_from_url("demo", url, requests.Session()) +def demo_wheel_metadata(demo_wheel: Path) -> RawMetadata: + with ZipFile(demo_wheel) as zf: + metadata, _ = parse_email(zf.read("demo-0.1.0.dist-info/METADATA")) + return metadata @pytest.fixture @@ -179,13 +173,28 @@ def test_info_from_wheel(demo_wheel: Path) -> None: assert info._source_url == demo_wheel.resolve().as_posix() -def test_info_from_memory_wheel(demo_memory_wheel: MemoryWheel) -> None: - info = PackageInfo.from_memory_wheel(demo_memory_wheel) +def test_info_from_wheel_metadata(demo_wheel_metadata: RawMetadata) -> None: + info = PackageInfo.from_wheel_metadata(demo_wheel_metadata) demo_check_info(info) + assert info.requires_python == ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" assert info._source_type is None assert info._source_url is None +def test_info_from_wheel_metadata_incomplete() -> None: + """ + To avoid differences in cached metadata, + it is important that the representation of missing fields does not change! 
+ """ + metadata, _ = parse_email(b"Metadata-Version: 2.1\nName: demo\nVersion: 0.1.0\n") + info = PackageInfo.from_wheel_metadata(metadata) + assert info.name == "demo" + assert info.version == "0.1.0" + assert info.summary is None + assert info.requires_dist is None + assert info.requires_python is None + + def test_info_from_bdist(demo_wheel: Path) -> None: info = PackageInfo.from_bdist(demo_wheel) demo_check_info(info) diff --git a/tests/inspection/test_lazy_wheel.py b/tests/inspection/test_lazy_wheel.py index 6c21b86a71a..c3bf310e773 100644 --- a/tests/inspection/test_lazy_wheel.py +++ b/tests/inspection/test_lazy_wheel.py @@ -2,7 +2,13 @@ import re +from pathlib import Path from typing import TYPE_CHECKING +from typing import Any +from typing import Dict +from typing import Protocol +from typing import Tuple +from urllib.parse import urlparse import httpretty import pytest @@ -12,12 +18,142 @@ from poetry.inspection.lazy_wheel import HTTPRangeRequestUnsupported from poetry.inspection.lazy_wheel import InvalidWheel -from poetry.inspection.lazy_wheel import memory_wheel_from_url -from tests.inspection.conftest import NEGATIVE_OFFSET_AS_POSITIVE +from poetry.inspection.lazy_wheel import metadata_from_wheel_url if TYPE_CHECKING: - from tests.inspection.conftest import RequestCallbackFactory + from collections.abc import Callable + + from httpretty.core import HTTPrettyRequest + + from tests.types import FixtureDirGetter + + HttPrettyResponse = Tuple[int, Dict[str, Any], bytes] # status code, headers, body + HttPrettyRequestCallback = Callable[ + [HTTPrettyRequest, str, Dict[str, Any]], HttPrettyResponse + ] + + class RequestCallbackFactory(Protocol): + def __call__( + self, + *, + accept_ranges: str | None = "bytes", + negative_offset_error: tuple[int, bytes] | None = None, + ) -> HttPrettyRequestCallback: ... 
+ + +NEGATIVE_OFFSET_AS_POSITIVE = -1 + + +def build_head_response( + accept_ranges: str | None, content_length: int, response_headers: dict[str, Any] +) -> HttPrettyResponse: + response_headers["Content-Length"] = content_length + if accept_ranges: + response_headers["Accept-Ranges"] = accept_ranges + return 200, response_headers, b"" + + +def build_partial_response( + rng: str, + wheel_bytes: bytes, + response_headers: dict[str, Any], + *, + negative_offset_as_positive: bool = False, +) -> HttPrettyResponse: + status_code = 206 + response_headers["Accept-Ranges"] = "bytes" + total_length = len(wheel_bytes) + if rng.startswith("-"): + # negative offset + offset = int(rng) + if negative_offset_as_positive: + # some servers interpret a negative offset like "-10" as "0-10" + start = 0 + end = min(-offset, total_length - 1) + body = wheel_bytes[start : end + 1] + else: + start = total_length + offset + if start < 0: + # wheel is smaller than initial chunk size + return 200, response_headers, wheel_bytes + end = total_length - 1 + body = wheel_bytes[offset:] + else: + # range with start and end + rng_parts = rng.split("-") + start = int(rng_parts[0]) + end = int(rng_parts[1]) + body = wheel_bytes[start : end + 1] + response_headers["Content-Range"] = f"bytes {start}-{end}/{total_length}" + return status_code, response_headers, body + + +@pytest.fixture +def handle_request_factory(fixture_dir: FixtureDirGetter) -> RequestCallbackFactory: + def _factory( + *, + accept_ranges: str | None = "bytes", + negative_offset_error: tuple[int, bytes] | None = None, + ) -> HttPrettyRequestCallback: + def handle_request( + request: HTTPrettyRequest, uri: str, response_headers: dict[str, Any] + ) -> HttPrettyResponse: + name = Path(urlparse(uri).path).name + + wheel = Path(__file__).parent.parent.joinpath( + "repositories/fixtures/pypi.org/dists/" + name + ) + + if not wheel.exists(): + wheel = fixture_dir("distributions") / name + + if not wheel.exists(): + wheel = ( + fixture_dir("distributions") / "demo-0.1.0-py2.py3-none-any.whl" + ) + + wheel_bytes = wheel.read_bytes() + + del response_headers["status"] + + if request.method == "HEAD": + return build_head_response( + accept_ranges, len(wheel_bytes), response_headers + ) + + rng = request.headers.get("Range", "") + if rng: + rng = rng.split("=")[1] + + negative_offset_as_positive = False + if negative_offset_error and rng.startswith("-"): + if negative_offset_error[0] == codes.requested_range_not_satisfiable: + response_headers["Content-Range"] = f"bytes */{len(wheel_bytes)}" + if negative_offset_error[0] == NEGATIVE_OFFSET_AS_POSITIVE: + negative_offset_as_positive = True + else: + return ( + negative_offset_error[0], + response_headers, + negative_offset_error[1], + ) + if accept_ranges == "bytes" and rng: + return build_partial_response( + rng, + wheel_bytes, + response_headers, + negative_offset_as_positive=negative_offset_as_positive, + ) + + status_code = 200 + body = wheel_bytes + + return status_code, response_headers, body + + return handle_request + + return _factory @pytest.mark.parametrize( @@ -30,7 +166,7 @@ (NEGATIVE_OFFSET_AS_POSITIVE, b"handle negative offset as positive"), ], ) -def test_memory_wheel_from_url( +def test_metadata_from_wheel_url( http: type[httpretty.httpretty], handle_request_factory: RequestCallbackFactory, negative_offset_error: tuple[int, bytes] | None, @@ -47,12 +183,12 @@ def test_memory_wheel_from_url( url = f"https://{domain}/poetry_core-1.5.0-py3-none-any.whl" - wheel = memory_wheel_from_url("poetry-core", url, 
requests.Session()) + metadata = metadata_from_wheel_url("poetry-core", url, requests.Session()) - assert wheel.name == "poetry-core" - assert wheel.version == "1.5.0" - assert wheel.author == "Sébastien Eustace" - assert wheel.requires_dist == [ + assert metadata["name"] == "poetry-core" + assert metadata["version"] == "1.5.0" + assert metadata["author"] == "Sébastien Eustace" + assert metadata["requires_dist"] == [ 'importlib-metadata (>=1.7.0) ; python_version < "3.8"' ] @@ -78,14 +214,14 @@ def test_memory_wheel_from_url( # second wheel -> one less request if negative offsets are not supported latest_requests.clear() - memory_wheel_from_url("poetry-core", url, requests.Session()) + metadata_from_wheel_url("poetry-core", url, requests.Session()) expected_requests = min(expected_requests, 4) latest_requests = httpretty.latest_requests() assert len(latest_requests) == expected_requests @pytest.mark.parametrize("negative_offset_as_positive", [False, True]) -def test_memory_wheel_from_url_smaller_than_initial_chunk_size( +def test_metadata_from_wheel_url_smaller_than_initial_chunk_size( http: type[httpretty.httpretty], handle_request_factory: RequestCallbackFactory, negative_offset_as_positive: bool, @@ -102,12 +238,12 @@ def test_memory_wheel_from_url_smaller_than_initial_chunk_size( url = f"https://{domain}/zipp-3.5.0-py3-none-any.whl" - wheel = memory_wheel_from_url("zipp", url, requests.Session()) + metadata = metadata_from_wheel_url("zipp", url, requests.Session()) - assert wheel.name == "zipp" - assert wheel.version == "3.5.0" - assert wheel.author == "Jason R. Coombs" - assert len(wheel.requires_dist) == 12 + assert metadata["name"] == "zipp" + assert metadata["version"] == "3.5.0" + assert metadata["author"] == "Jason R. Coombs" + assert len(metadata["requires_dist"]) == 12 # only one request because server gives a normal response with the entire wheel # except for when server interprets negative offset as positive @@ -116,7 +252,7 @@ def test_memory_wheel_from_url_smaller_than_initial_chunk_size( @pytest.mark.parametrize("accept_ranges", [None, "none"]) -def test_memory_wheel_from_url_range_requests_not_supported_one_request( +def test_metadata_from_wheel_url_range_requests_not_supported_one_request( http: type[httpretty.httpretty], handle_request_factory: RequestCallbackFactory, accept_ranges: str | None, @@ -130,7 +266,7 @@ def test_memory_wheel_from_url_range_requests_not_supported_one_request( url = f"https://{domain}/poetry_core-1.5.0-py3-none-any.whl" with pytest.raises(HTTPRangeRequestUnsupported): - memory_wheel_from_url("poetry-core", url, requests.Session()) + metadata_from_wheel_url("poetry-core", url, requests.Session()) latest_requests = http.latest_requests() assert len(latest_requests) == 1 @@ -144,7 +280,7 @@ def test_memory_wheel_from_url_range_requests_not_supported_one_request( (codes.not_implemented, b"Unsupported client range"), ], ) -def test_memory_wheel_from_url_range_requests_not_supported_two_requests( +def test_metadata_from_wheel_url_range_requests_not_supported_two_requests( http: type[httpretty.httpretty], handle_request_factory: RequestCallbackFactory, negative_offset_error: tuple[int, bytes], @@ -160,7 +296,7 @@ def test_memory_wheel_from_url_range_requests_not_supported_two_requests( url = f"https://{domain}/poetry_core-1.5.0-py3-none-any.whl" with pytest.raises(HTTPRangeRequestUnsupported): - memory_wheel_from_url("poetry-core", url, requests.Session()) + metadata_from_wheel_url("poetry-core", url, requests.Session()) latest_requests = 
http.latest_requests() assert len(latest_requests) == 2 @@ -168,7 +304,7 @@ def test_memory_wheel_from_url_range_requests_not_supported_two_requests( assert latest_requests[1].method == "HEAD" -def test_memory_wheel_from_url_invalid_wheel( +def test_metadata_from_wheel_url_invalid_wheel( http: type[httpretty.httpretty], handle_request_factory: RequestCallbackFactory, ) -> None: @@ -181,7 +317,7 @@ def test_memory_wheel_from_url_invalid_wheel( url = f"https://{domain}/demo_missing_dist_info-0.1.0-py2.py3-none-any.whl" with pytest.raises(InvalidWheel): - memory_wheel_from_url("demo-missing-dist-info", url, requests.Session()) + metadata_from_wheel_url("demo-missing-dist-info", url, requests.Session()) latest_requests = http.latest_requests() assert len(latest_requests) == 1 diff --git a/tests/repositories/test_http_repository.py b/tests/repositories/test_http_repository.py index 0eb0e9046e0..88d63171f86 100644 --- a/tests/repositories/test_http_repository.py +++ b/tests/repositories/test_http_repository.py @@ -5,11 +5,13 @@ from pathlib import Path from typing import TYPE_CHECKING from typing import Any +from zipfile import ZipFile import pytest +from packaging.metadata import parse_email + from poetry.inspection.lazy_wheel import HTTPRangeRequestUnsupported -from poetry.inspection.lazy_wheel import MemoryWheel from poetry.repositories.http_repository import HTTPRepository from poetry.utils.helpers import HTTPRangeRequestSupported @@ -40,10 +42,12 @@ def test_get_info_from_wheel( ) -> None: filename = "poetry_core-1.5.0-py3-none-any.whl" filepath = MockRepository.DIST_FIXTURES / filename + with ZipFile(filepath) as zf: + metadata, _ = parse_email(zf.read("poetry_core-1.5.0.dist-info/METADATA")) - mock_memory_wheel_from_url = mocker.patch( - "poetry.repositories.http_repository.memory_wheel_from_url", - return_value=MemoryWheel(filepath), # type: ignore[arg-type] + mock_metadata_from_wheel_url = mocker.patch( + "poetry.repositories.http_repository.metadata_from_wheel_url", + return_value=metadata, ) mock_download = mocker.patch( "poetry.repositories.http_repository.download_file", @@ -65,11 +69,13 @@ def test_get_info_from_wheel( ] if lazy_wheel and (supports_range_requests or supports_range_requests is None): - mock_memory_wheel_from_url.assert_called_once_with(filename, url, repo.session) + mock_metadata_from_wheel_url.assert_called_once_with( + filename, url, repo.session + ) mock_download.assert_not_called() assert repo._supports_range_requests[domain] is True else: - mock_memory_wheel_from_url.assert_not_called() + mock_metadata_from_wheel_url.assert_not_called() mock_download.assert_called_once_with( url, mocker.ANY, session=repo.session, raise_accepts_ranges=lazy_wheel ) @@ -97,8 +103,8 @@ def test_get_info_from_wheel_state_sequence(mocker: MockerFixture) -> None: 6. Range requests are supported for some files: We try range requests (success). """ - mock_memory_wheel_from_url = mocker.patch( - "poetry.repositories.http_repository.memory_wheel_from_url" + mock_metadata_from_wheel_url = mocker.patch( + "poetry.repositories.http_repository.metadata_from_wheel_url" ) mock_download = mocker.patch("poetry.repositories.http_repository.download_file") @@ -108,37 +114,37 @@ def test_get_info_from_wheel_state_sequence(mocker: MockerFixture) -> None: repo = MockRepository() # 1. 
range request and download - mock_memory_wheel_from_url.side_effect = HTTPRangeRequestUnsupported + mock_metadata_from_wheel_url.side_effect = HTTPRangeRequestUnsupported repo._get_info_from_wheel(url) - assert mock_memory_wheel_from_url.call_count == 1 + assert mock_metadata_from_wheel_url.call_count == 1 assert mock_download.call_count == 1 # 2. only download repo._get_info_from_wheel(url) - assert mock_memory_wheel_from_url.call_count == 1 + assert mock_metadata_from_wheel_url.call_count == 1 assert mock_download.call_count == 2 # 3. range request and download - mock_memory_wheel_from_url.side_effect = None + mock_metadata_from_wheel_url.side_effect = None mock_download.side_effect = HTTPRangeRequestSupported repo._get_info_from_wheel(url) - assert mock_memory_wheel_from_url.call_count == 2 + assert mock_metadata_from_wheel_url.call_count == 2 assert mock_download.call_count == 3 # 4. only range request repo._get_info_from_wheel(url) - assert mock_memory_wheel_from_url.call_count == 3 + assert mock_metadata_from_wheel_url.call_count == 3 assert mock_download.call_count == 3 # 5. range request and download - mock_memory_wheel_from_url.side_effect = HTTPRangeRequestUnsupported + mock_metadata_from_wheel_url.side_effect = HTTPRangeRequestUnsupported mock_download.side_effect = None repo._get_info_from_wheel(url) - assert mock_memory_wheel_from_url.call_count == 4 + assert mock_metadata_from_wheel_url.call_count == 4 assert mock_download.call_count == 4 # 6. only range request - mock_memory_wheel_from_url.side_effect = None + mock_metadata_from_wheel_url.side_effect = None repo._get_info_from_wheel(url) - assert mock_memory_wheel_from_url.call_count == 5 + assert mock_metadata_from_wheel_url.call_count == 5 assert mock_download.call_count == 4 From 346b7dfea373389b58cacca559ebe5a098fc7865 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Randy=20D=C3=B6ring?= <30527984+radoering@users.noreply.github.com> Date: Sat, 30 Dec 2023 10:51:45 +0100 Subject: [PATCH 3/9] refactor: rename `from_metadata` to `from_metadata_directory`, rename `from_wheel_metadata` to `from_metadata` --- src/poetry/inspection/info.py | 38 +++++++++++----------- src/poetry/repositories/http_repository.py | 2 +- tests/inspection/test_info.py | 6 ++-- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/poetry/inspection/info.py b/src/poetry/inspection/info.py index 362cb6a7370..0c9cfb519d5 100644 --- a/src/poetry/inspection/info.py +++ b/src/poetry/inspection/info.py @@ -393,7 +393,22 @@ def _find_dist_info(path: Path) -> Iterator[Path]: yield Path(d) @classmethod - def from_metadata(cls, path: Path) -> PackageInfo | None: + def from_metadata(cls, metadata: RawMetadata) -> PackageInfo: + """ + Create package information from core metadata. + + :param metadata: raw metadata + """ + return cls( + name=metadata.get("name"), + version=metadata.get("version"), + summary=metadata.get("summary"), + requires_dist=metadata.get("requires_dist"), + requires_python=metadata.get("requires_python"), + ) + + @classmethod + def from_metadata_directory(cls, path: Path) -> PackageInfo | None: """ Helper method to parse package information from an unpacked metadata directory. 
@@ -475,7 +490,7 @@ def from_directory(cls, path: Path, disable_build: bool = False) -> PackageInfo: if project_package: info = cls.from_package(project_package) else: - info = cls.from_metadata(path) + info = cls.from_metadata_directory(path) if not info or info.requires_dist is None: try: @@ -521,21 +536,6 @@ def from_wheel(cls, path: Path) -> PackageInfo: except ValueError: return PackageInfo() - @classmethod - def from_wheel_metadata(cls, metadata: RawMetadata) -> PackageInfo: - """ - Gather package information from metadata of a remote wheel. - - :param metadata: metadata of the wheel. - """ - return cls( - name=metadata.get("name"), - version=metadata.get("version"), - summary=metadata.get("summary"), - requires_dist=metadata.get("requires_dist"), - requires_python=metadata.get("requires_python"), - ) - @classmethod def from_bdist(cls, path: Path) -> PackageInfo: """ @@ -599,7 +599,7 @@ def get_pep517_metadata(path: Path) -> PackageInfo: *PEP517_META_BUILD_DEPS, ) venv.run_python_script(pep517_meta_build_script) - info = PackageInfo.from_metadata(dest_dir) + info = PackageInfo.from_metadata_directory(dest_dir) except EnvCommandError as e: # something went wrong while attempting pep517 metadata build # fallback to egg_info if setup.py available @@ -616,7 +616,7 @@ def get_pep517_metadata(path: Path) -> PackageInfo: os.chdir(path) try: venv.run("python", "setup.py", "egg_info") - info = PackageInfo.from_metadata(path) + info = PackageInfo.from_metadata_directory(path) except EnvCommandError as fbe: raise PackageInfoError( path, e, "Fallback egg_info generation failed.", fbe diff --git a/src/poetry/repositories/http_repository.py b/src/poetry/repositories/http_repository.py index 9c89ec83ee1..61f1a088dac 100644 --- a/src/poetry/repositories/http_repository.py +++ b/src/poetry/repositories/http_repository.py @@ -118,7 +118,7 @@ def _get_info_from_wheel(self, url: str) -> PackageInfo: # or we don't know yet, we try range requests. if self._lazy_wheel and self._supports_range_requests.get(netloc, True): try: - package_info = PackageInfo.from_wheel_metadata( + package_info = PackageInfo.from_metadata( metadata_from_wheel_url(link.filename, link.url, self.session) ) except HTTPRangeRequestUnsupported: diff --git a/tests/inspection/test_info.py b/tests/inspection/test_info.py index fed8efcf444..933d48a7f0e 100644 --- a/tests/inspection/test_info.py +++ b/tests/inspection/test_info.py @@ -174,7 +174,7 @@ def test_info_from_wheel(demo_wheel: Path) -> None: def test_info_from_wheel_metadata(demo_wheel_metadata: RawMetadata) -> None: - info = PackageInfo.from_wheel_metadata(demo_wheel_metadata) + info = PackageInfo.from_metadata(demo_wheel_metadata) demo_check_info(info) assert info.requires_python == ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" assert info._source_type is None @@ -187,7 +187,7 @@ def test_info_from_wheel_metadata_incomplete() -> None: it is important that the representation of missing fields does not change! 
""" metadata, _ = parse_email(b"Metadata-Version: 2.1\nName: demo\nVersion: 0.1.0\n") - info = PackageInfo.from_wheel_metadata(metadata) + info = PackageInfo.from_metadata(metadata) assert info.name == "demo" assert info.version == "0.1.0" assert info.summary is None @@ -228,7 +228,7 @@ def test_info_from_poetry_directory_fallback_on_poetry_create_error( def test_info_from_requires_txt(fixture_dir: FixtureDirGetter) -> None: - info = PackageInfo.from_metadata( + info = PackageInfo.from_metadata_directory( fixture_dir("inspection") / "demo_only_requires_txt.egg-info" ) assert info is not None From f25832f293fba4d56a4758f11779b236d70a131d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Randy=20D=C3=B6ring?= <30527984+radoering@users.noreply.github.com> Date: Tue, 9 Jan 2024 16:47:44 +0100 Subject: [PATCH 4/9] apply review feedback --- src/poetry/console/commands/config.py | 2 +- src/poetry/inspection/lazy_wheel.py | 41 +++++++++++++++------- tests/inspection/test_lazy_wheel.py | 28 +++++++-------- tests/repositories/test_http_repository.py | 2 +- 4 files changed, 43 insertions(+), 30 deletions(-) diff --git a/src/poetry/console/commands/config.py b/src/poetry/console/commands/config.py index ca5880a01e2..13e8b2262a0 100644 --- a/src/poetry/console/commands/config.py +++ b/src/poetry/console/commands/config.py @@ -68,7 +68,7 @@ def unique_config_values(self) -> dict[str, tuple[Any, Any]]: ), "virtualenvs.path": (str, lambda val: str(Path(val))), "virtualenvs.prefer-active-python": (boolean_validator, boolean_normalizer), - "virtualenvs.prompt": (str, lambda val: str(val)), + "virtualenvs.prompt": (str, str), "experimental.system-git-client": (boolean_validator, boolean_normalizer), "installer.modern-installation": (boolean_validator, boolean_normalizer), "installer.parallel": (boolean_validator, boolean_normalizer), diff --git a/src/poetry/inspection/lazy_wheel.py b/src/poetry/inspection/lazy_wheel.py index 0361d788594..a0b2bcdccda 100644 --- a/src/poetry/inspection/lazy_wheel.py +++ b/src/poetry/inspection/lazy_wheel.py @@ -30,6 +30,7 @@ if TYPE_CHECKING: from collections.abc import Iterable from collections.abc import Iterator + from types import TracebackType from packaging.metadata import RawMetadata from requests import Session @@ -51,12 +52,12 @@ class UnsupportedWheel(Exception): class InvalidWheel(Exception): """Invalid (e.g. corrupt) wheel.""" - def __init__(self, location: str, name: str): + def __init__(self, location: str, name: str) -> None: self.location = location self.name = name def __str__(self) -> str: - return f"Wheel '{self.name}' located at {self.location} is invalid." + return f"Wheel {self.name} located at {self.location} is invalid." 
def metadata_from_wheel_url( @@ -240,8 +241,13 @@ def __enter__(self) -> ReadOnlyIOWrapper: self._file.__enter__() return self - def __exit__(self, *exc: Any) -> None: - self._file.__exit__(*exc) + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + traceback: TracebackType | None, + ) -> None: + self._file.__exit__(exc_type, exc_value, traceback) def __iter__(self) -> Iterator[bytes]: raise NotImplementedError @@ -298,10 +304,15 @@ def __enter__(self) -> LazyRemoteResource: self._setup_content() return self - def __exit__(self, *exc: Any) -> None: + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + traceback: TracebackType | None, + ) -> None: """Call ``self._reset_content()``.""" self._reset_content() - super().__exit__(*exc) + super().__exit__(exc_type, exc_value, traceback) @contextmanager def _stay(self) -> Iterator[None]: @@ -455,10 +466,15 @@ def __enter__(self) -> LazyHTTPFile: super().__enter__() return self - def __exit__(self, *exc: Any) -> None: + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + traceback: TracebackType | None, + ) -> None: """Logs request count to quickly identify any pathological cases in log data.""" logger.debug("%d requests for url %s", self._request_count, self._url) - super().__exit__(*exc) + super().__exit__(exc_type, exc_value, traceback) def _content_length_from_head(self) -> int: """Performs a HEAD request to extract the Content-Length. @@ -514,6 +530,8 @@ class LazyWheelOverHTTP(LazyHTTPFile): # multiple times in the same invocation against an index without this support. _domains_without_negative_range: ClassVar[set[str]] = set() + _metadata_regex = re.compile(r"^[^/]*\.dist-info/METADATA$") + # This override is needed for mypy so we can call ``.prefetch_metadata()`` # within a ``with`` block. def __enter__(self) -> LazyWheelOverHTTP: @@ -711,7 +729,6 @@ def _prefetch_metadata(self, name: str) -> str: can be downloaded in a single ranged GET request.""" logger.debug("begin prefetching METADATA for %s", name) - metadata_regex = re.compile(r"^[^/]*\.dist-info/METADATA$") start: int | None = None end: int | None = None @@ -723,19 +740,19 @@ def _prefetch_metadata(self, name: str) -> str: filename = "" for info in zf.infolist(): if start is None: - if metadata_regex.search(info.filename): + if self._metadata_regex.search(info.filename): filename = info.filename start = info.header_offset continue else: # The last .dist-info/ entry may be before the end of the file if the # wheel's entries are sorted lexicographically (which is unusual). - if not metadata_regex.search(info.filename): + if not self._metadata_regex.search(info.filename): end = info.header_offset break if start is None: raise UnsupportedWheel( - f"no {metadata_regex!r} found for {name} in {self.name}" + f"no {self._metadata_regex!r} found for {name} in {self.name}" ) # If it is the last entry of the zip, then give us everything # until the start of the central directory. 
diff --git a/tests/inspection/test_lazy_wheel.py b/tests/inspection/test_lazy_wheel.py index c3bf310e773..b32bdd48d4f 100644 --- a/tests/inspection/test_lazy_wheel.py +++ b/tests/inspection/test_lazy_wheel.py @@ -28,9 +28,9 @@ from tests.types import FixtureDirGetter - HttPrettyResponse = Tuple[int, Dict[str, Any], bytes] # status code, headers, body - HttPrettyRequestCallback = Callable[ - [HTTPrettyRequest, str, Dict[str, Any]], HttPrettyResponse + HTTPrettyResponse = Tuple[int, Dict[str, Any], bytes] # status code, headers, body + HTTPrettyRequestCallback = Callable[ + [HTTPrettyRequest, str, Dict[str, Any]], HTTPrettyResponse ] class RequestCallbackFactory(Protocol): @@ -39,7 +39,7 @@ def __call__( *, accept_ranges: str | None = "bytes", negative_offset_error: tuple[int, bytes] | None = None, - ) -> HttPrettyRequestCallback: ... + ) -> HTTPrettyRequestCallback: ... NEGATIVE_OFFSET_AS_POSITIVE = -1 @@ -47,7 +47,7 @@ def __call__( def build_head_response( accept_ranges: str | None, content_length: int, response_headers: dict[str, Any] -) -> HttPrettyResponse: +) -> HTTPrettyResponse: response_headers["Content-Length"] = content_length if accept_ranges: response_headers["Accept-Ranges"] = accept_ranges @@ -60,7 +60,7 @@ def build_partial_response( response_headers: dict[str, Any], *, negative_offset_as_positive: bool = False, -) -> HttPrettyResponse: +) -> HTTPrettyResponse: status_code = 206 response_headers["Accept-Ranges"] = "bytes" total_length = len(wheel_bytes) @@ -81,9 +81,7 @@ def build_partial_response( body = wheel_bytes[offset:] else: # range with start and end - rng_parts = rng.split("-") - start = int(rng_parts[0]) - end = int(rng_parts[1]) + start, end = map(int, rng.split("-")) body = wheel_bytes[start : end + 1] response_headers["Content-Range"] = f"bytes {start}-{end}/{total_length}" return status_code, response_headers, body @@ -95,13 +93,13 @@ def _factory( *, accept_ranges: str | None = "bytes", negative_offset_error: tuple[int, bytes] | None = None, - ) -> HttPrettyRequestCallback: + ) -> HTTPrettyRequestCallback: def handle_request( request: HTTPrettyRequest, uri: str, response_headers: dict[str, Any] - ) -> HttPrettyResponse: + ) -> HTTPrettyResponse: name = Path(urlparse(uri).path).name - wheel = Path(__file__).parent.parent.joinpath( + wheel = Path(__file__).parents[1] / ( "repositories/fixtures/pypi.org/dists/" + name ) @@ -122,9 +120,7 @@ def handle_request( accept_ranges, len(wheel_bytes), response_headers ) - rng = request.headers.get("Range", "") - if rng: - rng = rng.split("=")[1] + rng = request.headers.get("Range", "=").split("=")[1] negative_offset_as_positive = False if negative_offset_error and rng.startswith("-"): @@ -226,7 +222,7 @@ def test_metadata_from_wheel_url_smaller_than_initial_chunk_size( handle_request_factory: RequestCallbackFactory, negative_offset_as_positive: bool, ) -> None: - domain = f"tiny-wheel-{str(negative_offset_as_positive).lower()}.com" + domain = f"tiny-wheel-{str(negative_offset_as_positive).casefold()}.com" uri_regex = re.compile(f"^https://{domain}/.*$") request_callback = handle_request_factory( negative_offset_error=( diff --git a/tests/repositories/test_http_repository.py b/tests/repositories/test_http_repository.py index 88d63171f86..d6591d42301 100644 --- a/tests/repositories/test_http_repository.py +++ b/tests/repositories/test_http_repository.py @@ -68,7 +68,7 @@ def test_get_info_from_wheel( 'importlib-metadata (>=1.7.0) ; python_version < "3.8"' ] - if lazy_wheel and (supports_range_requests or 
supports_range_requests is None): + if lazy_wheel and supports_range_requests is not False: mock_metadata_from_wheel_url.assert_called_once_with( filename, url, repo.session ) From 31b5c065ab75faf4b89c08b982882f23bc81cdb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Randy=20D=C3=B6ring?= <30527984+radoering@users.noreply.github.com> Date: Sat, 20 Jan 2024 16:50:08 +0100 Subject: [PATCH 5/9] remove one level of inheritance: merge LazyRemoteResource and FixedSizeLazyResource to LazyRemoteFile --- src/poetry/inspection/lazy_wheel.py | 188 ++++++++++++---------------- 1 file changed, 83 insertions(+), 105 deletions(-) diff --git a/src/poetry/inspection/lazy_wheel.py b/src/poetry/inspection/lazy_wheel.py index a0b2bcdccda..815546a533f 100644 --- a/src/poetry/inspection/lazy_wheel.py +++ b/src/poetry/inspection/lazy_wheel.py @@ -256,7 +256,7 @@ def __next__(self) -> bytes: raise NotImplementedError -class LazyRemoteResource(ReadOnlyIOWrapper): +class LazyRemoteFile(ReadOnlyIOWrapper): """Abstract class for a binary file object that lazily downloads its contents. This class uses a ``MergeIntervals`` instance to keep track of what it's downloaded. @@ -265,41 +265,9 @@ class LazyRemoteResource(ReadOnlyIOWrapper): def __init__(self, inner: BinaryIO) -> None: super().__init__(inner) self._merge_intervals: MergeIntervals | None = None + self._length: int | None = None - @abc.abstractmethod - def fetch_content_range(self, start: int, end: int) -> Iterator[bytes]: - """Call to the remote backend to provide this byte range in one or more chunks. - - NB: For compatibility with HTTP range requests, the range provided to this - method must *include* the byte indexed at argument ``end`` (so e.g. ``0-1`` is 2 - bytes long, and the range can never be empty). - - :raises Exception: implementations may raise an exception for e.g. intermittent - errors accessing the remote resource. - """ - ... - - def _setup_content(self) -> None: - """Ensure ``self._merge_intervals`` is initialized. - - Called in ``__enter__``, and should make recursive invocations into a no-op. - Subclasses may override this method.""" - if self._merge_intervals is None: - self._merge_intervals = MergeIntervals() - - def _reset_content(self) -> None: - """Unset ``self._merge_intervals``. - - Called in ``__exit__``, and should make recursive invocations into a no-op. - Subclasses may override this method.""" - if self._merge_intervals is not None: - logger.debug( - "unsetting merge intervals (were: %s)", repr(self._merge_intervals) - ) - self._merge_intervals = None - - def __enter__(self) -> LazyRemoteResource: - """Call ``self._setup_content()``, then return self.""" + def __enter__(self) -> LazyRemoteFile: super().__enter__() self._setup_content() return self @@ -310,49 +278,32 @@ def __exit__( exc_value: BaseException | None, traceback: TracebackType | None, ) -> None: - """Call ``self._reset_content()``.""" self._reset_content() super().__exit__(exc_type, exc_value, traceback) - @contextmanager - def _stay(self) -> Iterator[None]: - """Return a context manager keeping the position. - - At the end of the block, seek back to original position. - """ - pos = self.tell() - try: - yield - finally: - self.seek(pos) + def read(self, size: int = -1) -> bytes: + """Read up to size bytes from the object and return them. - def ensure_downloaded(self, start: int, end: int) -> None: - """Ensures bytes start to end (inclusive) have been downloaded and written to - the backing file. 
+ As a convenience, if size is unspecified or -1, + all bytes until EOF are returned. Fewer than + size bytes may be returned if EOF is reached. :raises ValueError: if ``__enter__`` was not called beforehand. """ - if self._merge_intervals is None: - raise ValueError(".__enter__() must be called to set up merge intervals") - # Reducing by 1 to get an inclusive end range. - end -= 1 - with self._stay(): - for ( - range_start, - range_end, - ) in self._merge_intervals.minimal_intervals_covering(start, end): - self.seek(start) - for chunk in self.fetch_content_range(range_start, range_end): - self._file.write(chunk) - - -class FixedSizeLazyResource(LazyRemoteResource): - """Fetches a fixed content length from the remote server when initialized in order - to support ``.read(-1)`` and seek calls against ``io.SEEK_END``.""" - - def __init__(self, inner: BinaryIO) -> None: - super().__init__(inner) - self._length: int | None = None + if self._length is None: + raise ValueError(".__enter__() must be called to set up content length") + cur = self.tell() + logger.debug("read size %d at %d from lazy file %s", size, cur, self.name) + if size < 0: + assert cur <= self._length + download_size = self._length - cur + elif size == 0: + return b"" + else: + download_size = size + stop = min(cur + download_size, self._length) + self._ensure_downloaded(cur, stop) + return super().read(download_size) @abc.abstractmethod def _fetch_content_length(self) -> int: @@ -364,69 +315,96 @@ def _fetch_content_length(self) -> int: :raises Exception: implementations may raise any type of exception if the length value could not be parsed, or any other issue which might - cause valid calls to ``self.fetch_content_range()`` to fail. + cause valid calls to ``self._fetch_content_range()`` to fail. + """ + raise NotImplementedError + + @abc.abstractmethod + def _fetch_content_range(self, start: int, end: int) -> Iterator[bytes]: + """Call to the remote backend to provide this byte range in one or more chunks. + + NB: For compatibility with HTTP range requests, the range provided to this + method must *include* the byte indexed at argument ``end`` (so e.g. ``0-1`` is 2 + bytes long, and the range can never be empty). + + :raises Exception: implementations may raise an exception for e.g. intermittent + errors accessing the remote resource. """ raise NotImplementedError def _setup_content(self) -> None: """Initialize the internal length field and other bookkeeping. + Ensure ``self._merge_intervals`` is initialized. + After parsing the remote file length with ``self._fetch_content_length()``, this method will truncate the underlying file from parent abstract class ``ReadOnlyIOWrapper`` to that size in order to support seek operations against ``io.SEEK_END`` in ``self.read()``. - This method is called in ``__enter__``. - """ - super()._setup_content() + Called in ``__enter__``, and should make recursive invocations into a no-op. + Subclasses may override this method.""" + if self._merge_intervals is None: + self._merge_intervals = MergeIntervals() if self._length is None: logger.debug("begin fetching content length") self._length = self._fetch_content_length() logger.debug("done fetching content length (is: %d)", self._length) + # Enable us to seek and write anywhere in the backing file up to this + # known length. + self.truncate(self._length) else: logger.debug("content length already fetched (is: %d)", self._length) - # Enable us to seek and write anywhere in the backing file up to this - # known length. 
- self.truncate(self._length) - def _reset_content(self) -> None: - """Unset the internal length field and other bookkeeping. + """Unset the internal length field and merge intervals. - This method is called in ``__exit__``.""" - super()._reset_content() + Called in ``__exit__``, and should make recursive invocations into a no-op. + Subclasses may override this method.""" + if self._merge_intervals is not None: + logger.debug( + "unsetting merge intervals (were: %s)", repr(self._merge_intervals) + ) + self._merge_intervals = None if self._length is not None: logger.debug("unsetting content length (was: %d)", self._length) self._length = None - def read(self, size: int = -1) -> bytes: - """Read up to size bytes from the object and return them. + @contextmanager + def _stay(self) -> Iterator[None]: + """Return a context manager keeping the position. - As a convenience, if size is unspecified or -1, - all bytes until EOF are returned. Fewer than - size bytes may be returned if EOF is reached. + At the end of the block, seek back to original position. + """ + pos = self.tell() + try: + yield + finally: + self.seek(pos) + + def _ensure_downloaded(self, start: int, end: int) -> None: + """Ensures bytes start to end (inclusive) have been downloaded and written to + the backing file. :raises ValueError: if ``__enter__`` was not called beforehand. """ - if self._length is None: - raise ValueError(".__enter__() must be called to set up content length") - cur = self.tell() - logger.debug("read size %d at %d from lazy file %s", size, cur, self.name) - if size < 0: - assert cur <= self._length - download_size = self._length - cur - elif size == 0: - return b"" - else: - download_size = size - stop = min(cur + download_size, self._length) - self.ensure_downloaded(cur, stop) - return super().read(download_size) + if self._merge_intervals is None: + raise ValueError(".__enter__() must be called to set up merge intervals") + # Reducing by 1 to get an inclusive end range. + end -= 1 + with self._stay(): + for ( + range_start, + range_end, + ) in self._merge_intervals.minimal_intervals_covering(start, end): + self.seek(start) + for chunk in self._fetch_content_range(range_start, range_end): + self._file.write(chunk) -class LazyHTTPFile(FixedSizeLazyResource): +class LazyHTTPFile(LazyRemoteFile): """File-like object representing a fixed-length file over HTTP. This uses HTTP range requests to lazily fetch the file's content into a temporary @@ -451,7 +429,7 @@ def _fetch_content_length(self) -> int: # again in LazyWheelOverHTTP. return self._content_length_from_head() - def fetch_content_range(self, start: int, end: int) -> Iterator[bytes]: + def _fetch_content_range(self, start: int, end: int) -> Iterator[bytes]: """Perform a series of HTTP range requests to cover the specified byte range.""" yield from self._stream_response(start, end).iter_content(CONTENT_CHUNK_SIZE) @@ -588,7 +566,7 @@ def _fetch_content_length(self) -> int: # size), then download all of this at once, hopefully pulling in the entire # central directory. initial_start = max(0, ret_length - initial_chunk_size) - self.ensure_downloaded(initial_start, ret_length) + self._ensure_downloaded(initial_start, ret_length) else: # If we *could* download some file contents, then write them to the end of # the file and set up our bisect boundaries by hand. 
@@ -759,7 +737,7 @@ def _prefetch_metadata(self, name: str) -> str: if end is None: end = zf.start_dir logger.debug(f"fetch {filename}") - self.ensure_downloaded(start, end) + self._ensure_downloaded(start, end) logger.debug("done prefetching METADATA for %s", name) return filename From 44a2570626bf71abf68b53a3cf225e235fea0580 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Randy=20D=C3=B6ring?= <30527984+radoering@users.noreply.github.com> Date: Sat, 20 Jan 2024 17:11:01 +0100 Subject: [PATCH 6/9] remove one level of inheritance: merge LazyRemoteFile and LazyHTTPFile to LazyFileOverHTTP --- src/poetry/inspection/lazy_wheel.py | 197 ++++++++++------------------ 1 file changed, 72 insertions(+), 125 deletions(-) diff --git a/src/poetry/inspection/lazy_wheel.py b/src/poetry/inspection/lazy_wheel.py index 815546a533f..95d19ef0311 100644 --- a/src/poetry/inspection/lazy_wheel.py +++ b/src/poetry/inspection/lazy_wheel.py @@ -2,7 +2,6 @@ from __future__ import annotations -import abc import io import logging import re @@ -256,18 +255,29 @@ def __next__(self) -> bytes: raise NotImplementedError -class LazyRemoteFile(ReadOnlyIOWrapper): - """Abstract class for a binary file object that lazily downloads its contents. +class LazyFileOverHTTP(ReadOnlyIOWrapper): + """File-like object representing a fixed-length file over HTTP. - This class uses a ``MergeIntervals`` instance to keep track of what it's downloaded. - """ + This uses HTTP range requests to lazily fetch the file's content into a temporary + file. If such requests are not supported by the server, raises + ``HTTPRangeRequestUnsupported`` in the ``__enter__`` method.""" + + def __init__( + self, + url: str, + session: Session | Authenticator, + delete_backing_file: bool = True, + ) -> None: + super().__init__(cast(BinaryIO, NamedTemporaryFile(delete=delete_backing_file))) - def __init__(self, inner: BinaryIO) -> None: - super().__init__(inner) self._merge_intervals: MergeIntervals | None = None self._length: int | None = None - def __enter__(self) -> LazyRemoteFile: + self._request_count = 0 + self._session = session + self._url = url + + def __enter__(self) -> LazyFileOverHTTP: super().__enter__() self._setup_content() return self @@ -305,32 +315,20 @@ def read(self, size: int = -1) -> bytes: self._ensure_downloaded(cur, stop) return super().read(download_size) - @abc.abstractmethod - def _fetch_content_length(self) -> int: - """Query the remote resource for the total length of the file. - - This method may also mutate internal state, such as writing to the backing - file. It's marked private because it will only be called within this class's - ``__enter__`` implementation to populate an internal length field. - - :raises Exception: implementations may raise any type of exception if the length - value could not be parsed, or any other issue which might - cause valid calls to ``self._fetch_content_range()`` to fail. - """ - raise NotImplementedError - - @abc.abstractmethod - def _fetch_content_range(self, start: int, end: int) -> Iterator[bytes]: - """Call to the remote backend to provide this byte range in one or more chunks. + @classmethod + def _uncached_headers(cls) -> dict[str, str]: + """HTTP headers to bypass any HTTP caching. - NB: For compatibility with HTTP range requests, the range provided to this - method must *include* the byte indexed at argument ``end`` (so e.g. ``0-1`` is 2 - bytes long, and the range can never be empty). 
+ The requests we perform in this file are intentionally small, and any caching + should be done at a higher level. - :raises Exception: implementations may raise an exception for e.g. intermittent - errors accessing the remote resource. + Further, caching partial requests might cause issues: + https://github.com/pypa/pip/pull/8716 """ - raise NotImplementedError + # "no-cache" is the correct value for "up to date every time", so this will also + # ensure we get the most recent value from the server: + # https://developer.mozilla.org/en-US/docs/Web/HTTP/Caching#provide_up-to-date_content_every_time + return {"Accept-Encoding": "identity", "Cache-Control": "no-cache"} def _setup_content(self) -> None: """Initialize the internal length field and other bookkeeping. @@ -372,88 +370,6 @@ def _reset_content(self) -> None: logger.debug("unsetting content length (was: %d)", self._length) self._length = None - @contextmanager - def _stay(self) -> Iterator[None]: - """Return a context manager keeping the position. - - At the end of the block, seek back to original position. - """ - pos = self.tell() - try: - yield - finally: - self.seek(pos) - - def _ensure_downloaded(self, start: int, end: int) -> None: - """Ensures bytes start to end (inclusive) have been downloaded and written to - the backing file. - - :raises ValueError: if ``__enter__`` was not called beforehand. - """ - if self._merge_intervals is None: - raise ValueError(".__enter__() must be called to set up merge intervals") - # Reducing by 1 to get an inclusive end range. - end -= 1 - with self._stay(): - for ( - range_start, - range_end, - ) in self._merge_intervals.minimal_intervals_covering(start, end): - self.seek(start) - for chunk in self._fetch_content_range(range_start, range_end): - self._file.write(chunk) - - -class LazyHTTPFile(LazyRemoteFile): - """File-like object representing a fixed-length file over HTTP. - - This uses HTTP range requests to lazily fetch the file's content into a temporary - file. If such requests are not supported by the server, raises - ``HTTPRangeRequestUnsupported`` in the ``__enter__`` method.""" - - def __init__( - self, - url: str, - session: Session | Authenticator, - delete_backing_file: bool = True, - ) -> None: - super().__init__(cast(BinaryIO, NamedTemporaryFile(delete=delete_backing_file))) - - self._request_count = 0 - self._session = session - self._url = url - - def _fetch_content_length(self) -> int: - """Get the remote file's length from a HEAD request.""" - # NB: This is currently dead code, as _fetch_content_length() is overridden - # again in LazyWheelOverHTTP. - return self._content_length_from_head() - - def _fetch_content_range(self, start: int, end: int) -> Iterator[bytes]: - """Perform a series of HTTP range requests to cover the specified byte range.""" - yield from self._stream_response(start, end).iter_content(CONTENT_CHUNK_SIZE) - - def __enter__(self) -> LazyHTTPFile: - """Fetch the remote file length and reset the log of downloaded intervals. - - This method must be called before ``.read()``. - - :raises HTTPRangeRequestUnsupported: if the remote server fails to indicate - support for "bytes" ranges. 
- """ - super().__enter__() - return self - - def __exit__( - self, - exc_type: type[BaseException] | None, - exc_value: BaseException | None, - traceback: TracebackType | None, - ) -> None: - """Logs request count to quickly identify any pathological cases in log data.""" - logger.debug("%d requests for url %s", self._request_count, self._url) - super().__exit__(exc_type, exc_value, traceback) - def _content_length_from_head(self) -> int: """Performs a HEAD request to extract the Content-Length. @@ -470,6 +386,12 @@ def _content_length_from_head(self) -> int: ) return int(head.headers["Content-Length"]) + def _fetch_content_length(self) -> int: + """Get the remote file's length.""" + # NB: This is currently dead code, as _fetch_content_length() is overridden + # again in LazyWheelOverHTTP. + return self._content_length_from_head() + def _stream_response(self, start: int, end: int) -> Response: """Return streaming HTTP response to a range request from start to end.""" headers = self._uncached_headers() @@ -481,23 +403,48 @@ def _stream_response(self, start: int, end: int) -> Response: assert int(response.headers["Content-Length"]) == (end - start + 1) return response - @classmethod - def _uncached_headers(cls) -> dict[str, str]: - """HTTP headers to bypass any HTTP caching. + def _fetch_content_range(self, start: int, end: int) -> Iterator[bytes]: + """Perform a series of HTTP range requests to cover the specified byte range. - The requests we perform in this file are intentionally small, and any caching - should be done at a higher level. + NB: For compatibility with HTTP range requests, the range provided to this + method must *include* the byte indexed at argument ``end`` (so e.g. ``0-1`` is 2 + bytes long, and the range can never be empty). + """ + yield from self._stream_response(start, end).iter_content(CONTENT_CHUNK_SIZE) - Further, caching partial requests might cause issues: - https://github.com/pypa/pip/pull/8716 + @contextmanager + def _stay(self) -> Iterator[None]: + """Return a context manager keeping the position. + + At the end of the block, seek back to original position. """ - # "no-cache" is the correct value for "up to date every time", so this will also - # ensure we get the most recent value from the server: - # https://developer.mozilla.org/en-US/docs/Web/HTTP/Caching#provide_up-to-date_content_every_time - return {"Accept-Encoding": "identity", "Cache-Control": "no-cache"} + pos = self.tell() + try: + yield + finally: + self.seek(pos) + + def _ensure_downloaded(self, start: int, end: int) -> None: + """Ensures bytes start to end (inclusive) have been downloaded and written to + the backing file. + + :raises ValueError: if ``__enter__`` was not called beforehand. + """ + if self._merge_intervals is None: + raise ValueError(".__enter__() must be called to set up merge intervals") + # Reducing by 1 to get an inclusive end range. + end -= 1 + with self._stay(): + for ( + range_start, + range_end, + ) in self._merge_intervals.minimal_intervals_covering(start, end): + self.seek(start) + for chunk in self._fetch_content_range(range_start, range_end): + self._file.write(chunk) -class LazyWheelOverHTTP(LazyHTTPFile): +class LazyWheelOverHTTP(LazyFileOverHTTP): """File-like object mapped to a ZIP file over HTTP. 
This uses HTTP range requests to lazily fetch the file's content, which should be From ec2d0f344b7c8424fa6f62a1f15e126c5447b614 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Randy=20D=C3=B6ring?= <30527984+radoering@users.noreply.github.com> Date: Sat, 20 Jan 2024 17:13:45 +0100 Subject: [PATCH 7/9] consistent order of methods: magic methods, public methods, internal methods --- src/poetry/inspection/lazy_wheel.py | 50 ++++++++++++++--------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/src/poetry/inspection/lazy_wheel.py b/src/poetry/inspection/lazy_wheel.py index 95d19ef0311..141f75ee296 100644 --- a/src/poetry/inspection/lazy_wheel.py +++ b/src/poetry/inspection/lazy_wheel.py @@ -150,6 +150,24 @@ class ReadOnlyIOWrapper(BinaryIO): def __init__(self, inner: BinaryIO) -> None: self._file = inner + def __enter__(self) -> ReadOnlyIOWrapper: + self._file.__enter__() + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + traceback: TracebackType | None, + ) -> None: + self._file.__exit__(exc_type, exc_value, traceback) + + def __iter__(self) -> Iterator[bytes]: + raise NotImplementedError + + def __next__(self) -> bytes: + raise NotImplementedError + @property def mode(self) -> str: """Opening mode, which is always rb.""" @@ -236,24 +254,6 @@ def write(self, s: Any) -> int: def writelines(self, lines: Iterable[Any]) -> None: raise NotImplementedError - def __enter__(self) -> ReadOnlyIOWrapper: - self._file.__enter__() - return self - - def __exit__( - self, - exc_type: type[BaseException] | None, - exc_value: BaseException | None, - traceback: TracebackType | None, - ) -> None: - self._file.__exit__(exc_type, exc_value, traceback) - - def __iter__(self) -> Iterator[bytes]: - raise NotImplementedError - - def __next__(self) -> bytes: - raise NotImplementedError - class LazyFileOverHTTP(ReadOnlyIOWrapper): """File-like object representing a fixed-length file over HTTP. @@ -467,6 +467,13 @@ def __enter__(self) -> LazyWheelOverHTTP: super().__enter__() return self + def read_metadata(self, name: str) -> bytes: + """Download and read the METADATA file from the remote wheel.""" + with ZipFile(self) as zf: + # prefetch metadata to reduce the number of range requests + filename = self._prefetch_metadata(name) + return zf.read(filename) + @classmethod def _initial_chunk_length(cls) -> int: """Return the size of the chunk (in bytes) to download from the end of the file. 
@@ -688,10 +695,3 @@ def _prefetch_metadata(self, name: str) -> str: logger.debug("done prefetching METADATA for %s", name) return filename - - def read_metadata(self, name: str) -> bytes: - """Download and read the METADATA file from the remote wheel.""" - with ZipFile(self) as zf: - # prefetch metadata to reduce the number of range requests - filename = self._prefetch_metadata(name) - return zf.read(filename) From 8309ad22641e0b22c3ca66aeeade69922396ca23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Randy=20D=C3=B6ring?= <30527984+radoering@users.noreply.github.com> Date: Sat, 20 Jan 2024 17:24:41 +0100 Subject: [PATCH 8/9] use type vars to avoid override only done for type checking --- src/poetry/inspection/lazy_wheel.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/src/poetry/inspection/lazy_wheel.py b/src/poetry/inspection/lazy_wheel.py index 141f75ee296..afc0ea570ac 100644 --- a/src/poetry/inspection/lazy_wheel.py +++ b/src/poetry/inspection/lazy_wheel.py @@ -14,6 +14,7 @@ from typing import Any from typing import BinaryIO from typing import ClassVar +from typing import TypeVar from typing import cast from urllib.parse import urlparse from zipfile import BadZipFile @@ -140,6 +141,9 @@ def minimal_intervals_covering( yield from self._merge(start, end, left, right) +T = TypeVar("T", bound="ReadOnlyIOWrapper") + + class ReadOnlyIOWrapper(BinaryIO): """Implement read-side ``BinaryIO`` methods wrapping an inner ``BinaryIO``. @@ -150,7 +154,7 @@ class ReadOnlyIOWrapper(BinaryIO): def __init__(self, inner: BinaryIO) -> None: self._file = inner - def __enter__(self) -> ReadOnlyIOWrapper: + def __enter__(self: T) -> T: self._file.__enter__() return self @@ -255,6 +259,9 @@ def writelines(self, lines: Iterable[Any]) -> None: raise NotImplementedError +U = TypeVar("U", bound="LazyFileOverHTTP") + + class LazyFileOverHTTP(ReadOnlyIOWrapper): """File-like object representing a fixed-length file over HTTP. @@ -277,7 +284,7 @@ def __init__( self._session = session self._url = url - def __enter__(self) -> LazyFileOverHTTP: + def __enter__(self: U) -> U: super().__enter__() self._setup_content() return self @@ -457,16 +464,6 @@ class LazyWheelOverHTTP(LazyFileOverHTTP): _metadata_regex = re.compile(r"^[^/]*\.dist-info/METADATA$") - # This override is needed for mypy so we can call ``.prefetch_metadata()`` - # within a ``with`` block. - def __enter__(self) -> LazyWheelOverHTTP: - """Fetch the remote file length and reset the log of downloaded intervals. - - This method must be called before ``.read()`` or ``.prefetch_metadata()``. - """ - super().__enter__() - return self - def read_metadata(self, name: str) -> bytes: """Download and read the METADATA file from the remote wheel.""" with ZipFile(self) as zf: From 48e0b573e92499d1609d09b972beb7339285fd10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Randy=20D=C3=B6ring?= <30527984+radoering@users.noreply.github.com> Date: Sat, 20 Jan 2024 19:31:09 +0100 Subject: [PATCH 9/9] remove redundant information --- src/poetry/inspection/lazy_wheel.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/poetry/inspection/lazy_wheel.py b/src/poetry/inspection/lazy_wheel.py index afc0ea570ac..885ea7166e5 100644 --- a/src/poetry/inspection/lazy_wheel.py +++ b/src/poetry/inspection/lazy_wheel.py @@ -106,10 +106,10 @@ def _merge( """Return an iterator of intervals to be fetched. 
Args: - start (int): Start of needed interval - end (int): End of needed interval - left (int): Index of first overlapping downloaded data - right (int): Index after last overlapping downloaded data + start: Start of needed interval + end: End of needed interval + left: Index of first overlapping downloaded data + right: Index after last overlapping downloaded data """ lslice, rslice = self._left[left:right], self._right[left:right] i = start = min([start] + lslice[:1])
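
The preceding patches settle on a small public surface for the lazy-wheel helper: construct a LazyWheelOverHTTP with a wheel URL and a requests Session (or Authenticator), enter it as a context manager (which probes the server with range requests, fetches the content length, and raises HTTPRangeRequestUnsupported if byte ranges are not supported), then call read_metadata() to pull only the METADATA member out of the remote ZIP. The sketch below is illustrative only and is not part of the patch series; the wheel URL and the "demo" distribution name are placeholders, and it assumes the constructor and read_metadata() signatures shown in the hunks above are kept.

    import logging

    import requests

    from poetry.inspection.lazy_wheel import LazyWheelOverHTTP

    # Surfaces the module's debug logging (content-length fetch, range reads).
    logging.basicConfig(level=logging.DEBUG)

    # Placeholder URL; any wheel served by a host that honors HTTP range requests works.
    url = "https://example.org/packages/demo-0.1.0-py3-none-any.whl"

    with requests.Session() as session:
        # __enter__ performs the initial probing and raises
        # HTTPRangeRequestUnsupported if the server cannot serve byte ranges.
        with LazyWheelOverHTTP(url, session) as lazy_wheel:
            # Only the end of the archive (central directory) and the
            # *.dist-info/METADATA member are fetched, via a handful of
            # range requests instead of downloading the whole wheel.
            metadata = lazy_wheel.read_metadata("demo")

    print(metadata.decode("utf-8").splitlines()[0])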