From 6f167b55a9293b423244318f1ae77fe576f2ee43 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Wed, 1 Jun 2022 02:06:40 -0400 Subject: [PATCH 01/21] PoC of PEP 691 --- src/pip/_internal/index/collector.py | 154 +++++++++++++++------- src/pip/_internal/index/package_finder.py | 6 +- 2 files changed, 106 insertions(+), 54 deletions(-) diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index c79e2410c80..8f70e713e59 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -6,6 +6,7 @@ import email.message import functools import itertools +import json import logging import os import re @@ -65,32 +66,44 @@ def _match_vcs_scheme(url: str) -> Optional[str]: return None -class _NotHTML(Exception): +class _NotAPIContent(Exception): def __init__(self, content_type: str, request_desc: str) -> None: super().__init__(content_type, request_desc) self.content_type = content_type self.request_desc = request_desc -def _ensure_html_header(response: Response) -> None: - """Check the Content-Type header to ensure the response contains HTML. +def _ensure_api_header(response: Response) -> None: + """ + Check the Content-Type header to ensure the response contains a Simple + API Response. - Raises `_NotHTML` if the content type is not text/html. + Raises `_NotAPIContent` if the content type is not a valid content-type. 
""" content_type = response.headers.get("Content-Type", "") - if not content_type.lower().startswith("text/html"): - raise _NotHTML(content_type, response.request.method) + + content_type_l = content_type.lower() + if content_type_l.startswith("text/html"): + return + elif content_type_l.startswith("application/vnd.pypi.simple.v1+html"): + return + elif content_type_l.startswith("application/vnd.pypi.simple.v1+json"): + return + + raise _NotAPIContent(content_type, response.request.method) class _NotHTTP(Exception): pass -def _ensure_html_response(url: str, session: PipSession) -> None: - """Send a HEAD request to the URL, and ensure the response contains HTML. +def _ensure_api_response(url: str, session: PipSession) -> None: + """ + Send a HEAD request to the URL, and ensure the response contains a simple + API Response. Raises `_NotHTTP` if the URL is not available for a HEAD request, or - `_NotHTML` if the content type is not text/html. + `_NotAPIContent` if the content type is not a valid content type. """ scheme, netloc, path, query, fragment = urllib.parse.urlsplit(url) if scheme not in {"http", "https"}: @@ -99,31 +112,37 @@ def _ensure_html_response(url: str, session: PipSession) -> None: resp = session.head(url, allow_redirects=True) raise_for_status(resp) - _ensure_html_header(resp) + _ensure_api_header(resp) -def _get_html_response(url: str, session: PipSession) -> Response: - """Access an HTML page with GET, and return the response. +def _get_simple_response(url: str, session: PipSession) -> Response: + """Access an Simple API response with GET, and return the response. This consists of three parts: 1. If the URL looks suspiciously like an archive, send a HEAD first to - check the Content-Type is HTML, to avoid downloading a large file. - Raise `_NotHTTP` if the content type cannot be determined, or - `_NotHTML` if it is not HTML. + check the Content-Type is HTML or Simple API, to avoid downloading a + large file. 
Raise `_NotHTTP` if the content type cannot be determined, or + `_NotAPIContent` if it is not HTML or a Simple API. 2. Actually perform the request. Raise HTTP exceptions on network failures. - 3. Check the Content-Type header to make sure we got HTML, and raise - `_NotHTML` otherwise. + 3. Check the Content-Type header to make sure we got a Simple API response, + and raise `_NotAPIContent` otherwise. """ if is_archive_file(Link(url).filename): - _ensure_html_response(url, session=session) + _ensure_api_response(url, session=session) logger.debug("Getting page %s", redact_auth_from_url(url)) resp = session.get( url, headers={ - "Accept": "text/html", + "Accept": ", ".join( + [ + "application/vnd.pypi.simple.v1+json", + "application/vnd.pypi.simple.v1+html; q=0.2", + "text/html; q=0.1", + ] + ), # We don't want to blindly returned cached data for # /simple/, because authors generally expecting that # twine upload && pip install will function, but if @@ -145,9 +164,10 @@ def _get_html_response(url: str, session: PipSession) -> Response: # The check for archives above only works if the url ends with # something that looks like an archive. However that is not a # requirement of an url. Unless we issue a HEAD request on every - # url we cannot know ahead of time for sure if something is HTML - # or not. However we can check after we've downloaded it. - _ensure_html_header(resp) + # url we cannot know ahead of time for sure if something is a + # Simple API response or not. However we can check after we've + # downloaded it. 
+ _ensure_api_header(resp) return resp @@ -273,7 +293,7 @@ def _create_link_from_element( class CacheablePageContent: - def __init__(self, page: "HTMLPage") -> None: + def __init__(self, page: "IndexContent") -> None: assert page.cache_link_parsing self.page = page @@ -286,15 +306,15 @@ def __hash__(self) -> int: class ParseLinks(Protocol): def __call__( - self, page: "HTMLPage", use_deprecated_html5lib: bool + self, page: "IndexContent", use_deprecated_html5lib: bool ) -> Iterable[Link]: ... -def with_cached_html_pages(fn: ParseLinks) -> ParseLinks: +def with_cached_index_content(fn: ParseLinks) -> ParseLinks: """ - Given a function that parses an Iterable[Link] from an HTMLPage, cache the - function's result (keyed by CacheablePageContent), unless the HTMLPage + Given a function that parses an Iterable[Link] from an IndexContent, cache the + function's result (keyed by CacheablePageContent), unless the IndexContent `page` has `page.cache_link_parsing == False`. """ @@ -305,7 +325,9 @@ def wrapper( return list(fn(cacheable_page.page, use_deprecated_html5lib)) @functools.wraps(fn) - def wrapper_wrapper(page: "HTMLPage", use_deprecated_html5lib: bool) -> List[Link]: + def wrapper_wrapper( + page: "IndexContent", use_deprecated_html5lib: bool + ) -> List[Link]: if page.cache_link_parsing: return wrapper(CacheablePageContent(page), use_deprecated_html5lib) return list(fn(page, use_deprecated_html5lib)) @@ -313,7 +335,7 @@ def wrapper_wrapper(page: "HTMLPage", use_deprecated_html5lib: bool) -> List[Lin return wrapper_wrapper -def _parse_links_html5lib(page: "HTMLPage") -> Iterable[Link]: +def _parse_links_html5lib(page: "IndexContent") -> Iterable[Link]: """ Parse an HTML document, and yield its anchor elements as Link objects. 
@@ -338,12 +360,35 @@ def _parse_links_html5lib(page: "HTMLPage") -> Iterable[Link]: yield link -@with_cached_html_pages -def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Link]: +@with_cached_index_content +def parse_links(page: "IndexContent", use_deprecated_html5lib: bool) -> Iterable[Link]: """ - Parse an HTML document, and yield its anchor elements as Link objects. + Parse a Simple API's Index Content, and yield its anchor elements as Link objects. """ + content_type_l = page.content_type.lower() + if content_type_l.startswith("application/vnd.pypi.simple.v1+json"): + data = json.loads(page.content) + for file in data.get("files", []): + file_url = file.get("url") + if file_url is None: + continue + + # The Link.yanked_reason expects an empty string instead of a boolean. + yanked_reason = file.get("yanked") + if yanked_reason and not isinstance(yanked_reason, str): + yanked_reason = "" + # The Link.yanked_reason expects None instead of False + elif not yanked_reason: + yanked_reason = None + + yield Link( + _clean_link(urllib.parse.urljoin(page.url, file_url)), + comes_from=page.url, + requires_python=file.get("requires-python"), + yanked_reason=yanked_reason, + ) + if use_deprecated_html5lib: yield from _parse_links_html5lib(page) return @@ -365,12 +410,13 @@ def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Lin yield link -class HTMLPage: - """Represents one page, along with its URL""" +class IndexContent: + """Represents one response (or page), along with its URL""" def __init__( self, content: bytes, + content_type: str, encoding: Optional[str], url: str, cache_link_parsing: bool = True, @@ -383,6 +429,7 @@ def __init__( have this set to False, for example. 
""" self.content = content + self.content_type = content_type self.encoding = encoding self.url = url self.cache_link_parsing = cache_link_parsing @@ -419,7 +466,7 @@ def get_href(self, attrs: List[Tuple[str, Optional[str]]]) -> Optional[str]: return None -def _handle_get_page_fail( +def _handle_get_simple_fail( link: Link, reason: Union[str, Exception], meth: Optional[Callable[..., None]] = None, @@ -429,19 +476,22 @@ def _handle_get_page_fail( meth("Could not fetch URL %s: %s - skipping", link, reason) -def _make_html_page(response: Response, cache_link_parsing: bool = True) -> HTMLPage: +def _make_index_content( + response: Response, cache_link_parsing: bool = True +) -> IndexContent: encoding = _get_encoding_from_headers(response.headers) - return HTMLPage( + return IndexContent( response.content, + response.headers["Content-Type"], encoding=encoding, url=response.url, cache_link_parsing=cache_link_parsing, ) -def _get_html_page( +def _get_index_content( link: Link, session: Optional[PipSession] = None -) -> Optional["HTMLPage"]: +) -> Optional["IndexContent"]: if session is None: raise TypeError( "_get_html_page() missing 1 required keyword argument: 'session'" @@ -468,37 +518,39 @@ def _get_html_page( url += "/" url = urllib.parse.urljoin(url, "index.html") logger.debug(" file: URL is directory, getting %s", url) + # TODO: index.json? try: - resp = _get_html_response(url, session=session) + resp = _get_simple_response(url, session=session) except _NotHTTP: logger.warning( "Skipping page %s because it looks like an archive, and cannot " "be checked by a HTTP HEAD request.", link, ) - except _NotHTML as exc: + except _NotAPIContent as exc: logger.warning( - "Skipping page %s because the %s request got Content-Type: %s." - "The only supported Content-Type is text/html", + "Skipping page %s because the %s request got Content-Type: %s. 
" + "The only supported Content-Types are application/vnd.pypi.simple.v1+json, " + "application/vnd.pypi.simple.v1+html, and text/html", link, exc.request_desc, exc.content_type, ) except NetworkConnectionError as exc: - _handle_get_page_fail(link, exc) + _handle_get_simple_fail(link, exc) except RetryError as exc: - _handle_get_page_fail(link, exc) + _handle_get_simple_fail(link, exc) except SSLError as exc: reason = "There was a problem confirming the ssl certificate: " reason += str(exc) - _handle_get_page_fail(link, reason, meth=logger.info) + _handle_get_simple_fail(link, reason, meth=logger.info) except requests.ConnectionError as exc: - _handle_get_page_fail(link, f"connection error: {exc}") + _handle_get_simple_fail(link, f"connection error: {exc}") except requests.Timeout: - _handle_get_page_fail(link, "timed out") + _handle_get_simple_fail(link, "timed out") else: - return _make_html_page(resp, cache_link_parsing=link.cache_link_parsing) + return _make_index_content(resp, cache_link_parsing=link.cache_link_parsing) return None @@ -561,11 +613,11 @@ def create( def find_links(self) -> List[str]: return self.search_scope.find_links - def fetch_page(self, location: Link) -> Optional[HTMLPage]: + def fetch_response(self, location: Link) -> Optional[IndexContent]: """ Fetch an HTML page containing package links. 
""" - return _get_html_page(location, session=self.session) + return _get_index_content(location, session=self.session) def collect_sources( self, diff --git a/src/pip/_internal/index/package_finder.py b/src/pip/_internal/index/package_finder.py index f70f74b17c6..dbb6a64066c 100644 --- a/src/pip/_internal/index/package_finder.py +++ b/src/pip/_internal/index/package_finder.py @@ -792,11 +792,11 @@ def process_project_url( "Fetching project page and analyzing links: %s", project_url, ) - html_page = self._link_collector.fetch_page(project_url) - if html_page is None: + index_response = self._link_collector.fetch_response(project_url) + if index_response is None: return [] - page_links = list(parse_links(html_page, self._use_deprecated_html5lib)) + page_links = list(parse_links(index_response, self._use_deprecated_html5lib)) with indent_log(): package_links = self.evaluate_links( From 1fc571f3f69e9592c4092709aa0e329d9ed4b78d Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 25 Jun 2022 14:15:28 -0400 Subject: [PATCH 02/21] Support Hashes from PEP691 --- src/pip/_internal/index/collector.py | 14 +++++++++++++- src/pip/_internal/models/link.py | 24 ++++++++++++------------ 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index 8f70e713e59..3aaf5ceee4c 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -34,7 +34,7 @@ from pip._vendor.requests.exceptions import RetryError, SSLError from pip._internal.exceptions import NetworkConnectionError -from pip._internal.models.link import Link +from pip._internal.models.link import Link, SUPPORTED_HASHES from pip._internal.models.search_scope import SearchScope from pip._internal.network.session import PipSession from pip._internal.network.utils import raise_for_status @@ -266,6 +266,11 @@ def _clean_link(url: str) -> str: return urllib.parse.urlunparse(result._replace(path=path)) +_HASH_RE 
= re.compile( + r"({choices})=([a-f0-9]+)".format(choices="|".join(SUPPORTED_HASHES)) +) + + def _create_link_from_element( element_attribs: Dict[str, Optional[str]], page_url: str, @@ -282,11 +287,17 @@ def _create_link_from_element( pyrequire = element_attribs.get("data-requires-python") yanked_reason = element_attribs.get("data-yanked") + hashes = {} + hm = _HASH_RE.search(url) + if hm is not None: + hashes[hm.group(1).lower()] = hm.group(2) + link = Link( url, comes_from=page_url, requires_python=pyrequire, yanked_reason=yanked_reason, + hashes=hashes, ) return link @@ -387,6 +398,7 @@ def parse_links(page: "IndexContent", use_deprecated_html5lib: bool) -> Iterable comes_from=page.url, requires_python=file.get("requires-python"), yanked_reason=yanked_reason, + hashes=file.get("hashes", {}), ) if use_deprecated_html5lib: diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py index 6069b278b9b..238b2b423dc 100644 --- a/src/pip/_internal/models/link.py +++ b/src/pip/_internal/models/link.py @@ -22,7 +22,9 @@ logger = logging.getLogger(__name__) -_SUPPORTED_HASHES = ("sha1", "sha224", "sha384", "sha256", "sha512", "md5") +# Order matters, earlier hashes have a precedence over later hashes for what +# we will pick to use. +SUPPORTED_HASHES = ("sha512", "sha384", "sha256", "sha224", "sha1", "md5") class Link(KeyBasedCompareMixin): @@ -44,6 +46,7 @@ def __init__( requires_python: Optional[str] = None, yanked_reason: Optional[str] = None, cache_link_parsing: bool = True, + hashes: Optional[dict[str, str]] = None, ) -> None: """ :param url: url of the resource pointed to (href of the link) @@ -74,6 +77,7 @@ def __init__( # Store the url as a private attribute to prevent accidentally # trying to set a new value. 
self._url = url + self._hashes = hashes if hashes is not None else {} self.comes_from = comes_from self.requires_python = requires_python if requires_python else None @@ -165,22 +169,18 @@ def subdirectory_fragment(self) -> Optional[str]: return None return match.group(1) - _hash_re = re.compile( - r"({choices})=([a-f0-9]+)".format(choices="|".join(_SUPPORTED_HASHES)) - ) - @property def hash(self) -> Optional[str]: - match = self._hash_re.search(self._url) - if match: - return match.group(2) + for hashname in SUPPORTED_HASHES: + if hashname in self._hashes: + return self._hashes[hashname] return None @property def hash_name(self) -> Optional[str]: - match = self._hash_re.search(self._url) - if match: - return match.group(1) + for hashname in SUPPORTED_HASHES: + if hashname in self._hashes: + return hashname return None @property @@ -274,7 +274,7 @@ def _clean_link(link: Link) -> _CleanResult: subdirectory = "" # If there are multiple hash values under the same algorithm, use the # first one. This matches the behavior of Link.hash_value. 
- hashes = {k: fragment[k][0] for k in _SUPPORTED_HASHES if k in fragment} + hashes = {k: fragment[k][0] for k in SUPPORTED_HASHES if k in fragment} return _CleanResult( parsed=parsed._replace(netloc=netloc, query="", fragment=""), query=urllib.parse.parse_qs(parsed.query), From a287a0bff360414c38bc0540d28653cd8ffe4ce1 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 25 Jun 2022 14:17:04 -0400 Subject: [PATCH 03/21] Use a tuple and one call to startswith --- src/pip/_internal/index/collector.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index 3aaf5ceee4c..2e219021548 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -83,11 +83,13 @@ def _ensure_api_header(response: Response) -> None: content_type = response.headers.get("Content-Type", "") content_type_l = content_type.lower() - if content_type_l.startswith("text/html"): - return - elif content_type_l.startswith("application/vnd.pypi.simple.v1+html"): - return - elif content_type_l.startswith("application/vnd.pypi.simple.v1+json"): + if content_type_l.startswith( + ( + "text/html", + "application/vnd.pypi.simple.v1+html", + "application/vnd.pypi.simple.v1+json", + ) + ): return raise _NotAPIContent(content_type, response.request.method) From 9c58d2d11bdc9bb4690378921c676c79e2b3ca26 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 25 Jun 2022 14:17:15 -0400 Subject: [PATCH 04/21] Use smaller values for HTML --- src/pip/_internal/index/collector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index 2e219021548..a2319f624ee 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -141,8 +141,8 @@ def _get_simple_response(url: str, session: PipSession) -> Response: "Accept": ", ".join( [ 
"application/vnd.pypi.simple.v1+json", - "application/vnd.pypi.simple.v1+html; q=0.2", - "text/html; q=0.1", + "application/vnd.pypi.simple.v1+html; q=0.1", + "text/html; q=0.01", ] ), # We don't want to blindly returned cached data for From 21fadc58584473f7113f24ce7dd19c95b3908f7c Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 25 Jun 2022 14:21:29 -0400 Subject: [PATCH 05/21] fix typing --- src/pip/_internal/models/link.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py index 238b2b423dc..ea18e382244 100644 --- a/src/pip/_internal/models/link.py +++ b/src/pip/_internal/models/link.py @@ -17,7 +17,7 @@ from pip._internal.utils.urls import path_to_url, url_to_path if TYPE_CHECKING: - from pip._internal.index.collector import HTMLPage + from pip._internal.index.collector import IndexContent logger = logging.getLogger(__name__) @@ -33,6 +33,7 @@ class Link(KeyBasedCompareMixin): __slots__ = [ "_parsed_url", "_url", + "_hashes", "comes_from", "requires_python", "yanked_reason", @@ -42,7 +43,7 @@ class Link(KeyBasedCompareMixin): def __init__( self, url: str, - comes_from: Optional[Union[str, "HTMLPage"]] = None, + comes_from: Optional[Union[str, "IndexContent"]] = None, requires_python: Optional[str] = None, yanked_reason: Optional[str] = None, cache_link_parsing: bool = True, @@ -50,7 +51,7 @@ def __init__( ) -> None: """ :param url: url of the resource pointed to (href of the link) - :param comes_from: instance of HTMLPage where the link was found, + :param comes_from: instance of IndexContent where the link was found, or string. :param requires_python: String containing the `Requires-Python` metadata field, specified in PEP 345. 
This may be specified by From 55192a16747fbf73e37d213de11b228b24e4dee7 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 25 Jun 2022 14:23:00 -0400 Subject: [PATCH 06/21] Fix tests --- tests/unit/test_collector.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index 868a13b03b2..7c77a40dfb9 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -13,7 +13,7 @@ from pip._internal.exceptions import NetworkConnectionError from pip._internal.index.collector import ( - HTMLPage, + IndexContent, LinkCollector, _clean_link, _clean_url_path, @@ -426,7 +426,7 @@ def _test_parse_links_data_attribute( "{}" ).format(anchor_html) html_bytes = html.encode("utf-8") - page = HTMLPage( + page = IndexContent( html_bytes, encoding=None, # parse_links() is cached by url, so we inject a random uuid to ensure @@ -503,14 +503,14 @@ def test_parse_links_caches_same_page_by_url() -> None: url = "https://example.com/simple/" - page_1 = HTMLPage( + page_1 = IndexContent( html_bytes, encoding=None, url=url, ) # Make a second page with zero content, to ensure that it's not accessed, # because the page was cached by url. - page_2 = HTMLPage( + page_2 = IndexContent( b"", encoding=None, url=url, @@ -518,7 +518,7 @@ def test_parse_links_caches_same_page_by_url() -> None: # Make a third page which represents an index url, which should not be # cached, even for the same url. We modify the page content slightly to # verify that the result is not cached. 
- page_3 = HTMLPage( + page_3 = IndexContent( re.sub(b"pkg1", b"pkg2", html_bytes), encoding=None, url=url, @@ -541,7 +541,7 @@ def test_parse_links_caches_same_page_by_url() -> None: def test_parse_link_handles_deprecated_usage_properly() -> None: html = b'' url = "https://example.com/simple/" - page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False) + page = IndexContent(html, encoding=None, url=url, cache_link_parsing=False) parsed_links = list(parse_links(page, use_deprecated_html5lib=True)) From 63bd865ef772fb9334b0029de04883b80f9bb06f Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 25 Jun 2022 14:30:07 -0400 Subject: [PATCH 07/21] Add logging --- src/pip/_internal/index/collector.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index a2319f624ee..7bdbb928858 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -171,6 +171,12 @@ def _get_simple_response(url: str, session: PipSession) -> Response: # downloaded it. 
_ensure_api_header(resp) + logger.debug( + "Fetched page %s as %s", + redact_auth_from_url(url), + resp.headers.get("Content-Type", "Unknown"), + ) + return resp From 5600eea70bc25d7ab24bf59d358b45162b75728e Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 25 Jun 2022 14:32:03 -0400 Subject: [PATCH 08/21] Fix Typing on older Pythons --- src/pip/_internal/models/link.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py index ea18e382244..0c6cb5f74f2 100644 --- a/src/pip/_internal/models/link.py +++ b/src/pip/_internal/models/link.py @@ -4,7 +4,16 @@ import posixpath import re import urllib.parse -from typing import TYPE_CHECKING, Dict, List, NamedTuple, Optional, Tuple, Union +from typing import ( + TYPE_CHECKING, + Dict, + List, + NamedTuple, + Optional, + Tuple, + Union, + Mapping, +) from pip._internal.utils.filetypes import WHEEL_EXTENSION from pip._internal.utils.hashes import Hashes @@ -47,7 +56,7 @@ def __init__( requires_python: Optional[str] = None, yanked_reason: Optional[str] = None, cache_link_parsing: bool = True, - hashes: Optional[dict[str, str]] = None, + hashes: Optional[Mapping[str, str]] = None, ) -> None: """ :param url: url of the resource pointed to (href of the link) From b64a067e5345452d29706be3d9afd0641fb11bba Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 25 Jun 2022 14:33:08 -0400 Subject: [PATCH 09/21] Fix import ordering --- src/pip/_internal/index/collector.py | 2 +- src/pip/_internal/models/link.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index 7bdbb928858..b896a57bfdf 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -34,7 +34,7 @@ from pip._vendor.requests.exceptions import RetryError, SSLError from pip._internal.exceptions import NetworkConnectionError -from 
pip._internal.models.link import Link, SUPPORTED_HASHES +from pip._internal.models.link import SUPPORTED_HASHES, Link from pip._internal.models.search_scope import SearchScope from pip._internal.network.session import PipSession from pip._internal.network.utils import raise_for_status diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py index 0c6cb5f74f2..c44ae1be3f5 100644 --- a/src/pip/_internal/models/link.py +++ b/src/pip/_internal/models/link.py @@ -8,11 +8,11 @@ TYPE_CHECKING, Dict, List, + Mapping, NamedTuple, Optional, Tuple, Union, - Mapping, ) from pip._internal.utils.filetypes import WHEEL_EXTENSION From cdc2582ac93959984d500de61e6ba8485a2fad46 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 25 Jun 2022 14:37:06 -0400 Subject: [PATCH 10/21] Add a news entry --- news/11158.feature.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 news/11158.feature.rst diff --git a/news/11158.feature.rst b/news/11158.feature.rst new file mode 100644 index 00000000000..74436d7dccf --- /dev/null +++ b/news/11158.feature.rst @@ -0,0 +1 @@ +Support `PEP 691 `_. 
From 21a99e4b87888bc1b14ad6f2f397bed80449fcf3 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 25 Jun 2022 14:45:45 -0400 Subject: [PATCH 11/21] Update tests to use new names --- tests/unit/test_collector.py | 98 +++++++++++++++++++----------------- 1 file changed, 53 insertions(+), 45 deletions(-) diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index 7c77a40dfb9..f357c588fe0 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -18,10 +18,10 @@ _clean_link, _clean_url_path, _determine_base_url, - _get_html_page, - _get_html_response, - _make_html_page, - _NotHTML, + _get_index_content, + _get_simple_response, + _make_index_content, + _NotAPIContent, _NotHTTP, parse_links, ) @@ -40,13 +40,13 @@ "file:///opt/data/pip-18.0.tar.gz", ], ) -def test_get_html_response_archive_to_naive_scheme(url: str) -> None: +def test_get_simple_response_archive_to_naive_scheme(url: str) -> None: """ - `_get_html_response()` should error on an archive-like URL if the scheme + `_get_simple_response()` should error on an archive-like URL if the scheme does not allow "poking" without getting data. """ with pytest.raises(_NotHTTP): - _get_html_response(url, session=mock.Mock(PipSession)) + _get_simple_response(url, session=mock.Mock(PipSession)) @pytest.mark.parametrize( @@ -57,12 +57,12 @@ def test_get_html_response_archive_to_naive_scheme(url: str) -> None: ], ) @mock.patch("pip._internal.index.collector.raise_for_status") -def test_get_html_response_archive_to_http_scheme( +def test_get_simple_response_archive_to_http_scheme( mock_raise_for_status: mock.Mock, url: str, content_type: str ) -> None: """ - `_get_html_response()` should send a HEAD request on an archive-like URL - if the scheme supports it, and raise `_NotHTML` if the response isn't HTML. + `_get_simple_response()` should send a HEAD request on an archive-like URL + if the scheme supports it, and raise `_NotAPIContent` if the response isn't HTML. 
""" session = mock.Mock(PipSession) session.head.return_value = mock.Mock( @@ -72,8 +72,8 @@ def test_get_html_response_archive_to_http_scheme( } ) - with pytest.raises(_NotHTML) as ctx: - _get_html_response(url, session=session) + with pytest.raises(_NotAPIContent) as ctx: + _get_simple_response(url, session=session) session.assert_has_calls( [ @@ -91,10 +91,10 @@ def test_get_html_response_archive_to_http_scheme( ("file:///opt/data/pip-18.0.tar.gz"), ], ) -def test_get_html_page_invalid_content_type_archive( +def test_get_index_content_invalid_content_type_archive( caplog: pytest.LogCaptureFixture, url: str ) -> None: - """`_get_html_page()` should warn if an archive URL is not HTML + """`_get_index_content()` should warn if an archive URL is not HTML and therefore cannot be used for a HEAD request. """ caplog.set_level(logging.WARNING) @@ -102,7 +102,7 @@ def test_get_html_page_invalid_content_type_archive( session = mock.Mock(PipSession) - assert _get_html_page(link, session=session) is None + assert _get_index_content(link, session=session) is None assert ( "pip._internal.index.collector", logging.WARNING, @@ -119,11 +119,11 @@ def test_get_html_page_invalid_content_type_archive( ], ) @mock.patch("pip._internal.index.collector.raise_for_status") -def test_get_html_response_archive_to_http_scheme_is_html( +def test_get_simple_response_archive_to_http_scheme_is_html( mock_raise_for_status: mock.Mock, url: str ) -> None: """ - `_get_html_response()` should work with archive-like URLs if the HEAD + `_get_simple_response()` should work with archive-like URLs if the HEAD request is responded with text/html. 
""" session = mock.Mock(PipSession) @@ -135,7 +135,7 @@ def test_get_html_response_archive_to_http_scheme_is_html( ) session.get.return_value = mock.Mock(headers={"Content-Type": "text/html"}) - resp = _get_html_response(url, session=session) + resp = _get_simple_response(url, session=session) assert resp is not None assert session.mock_calls == [ @@ -163,9 +163,11 @@ def test_get_html_response_archive_to_http_scheme_is_html( ], ) @mock.patch("pip._internal.index.collector.raise_for_status") -def test_get_html_response_no_head(mock_raise_for_status: mock.Mock, url: str) -> None: +def test_get_simple_response_no_head( + mock_raise_for_status: mock.Mock, url: str +) -> None: """ - `_get_html_response()` shouldn't send a HEAD request if the URL does not + `_get_simple_response()` shouldn't send a HEAD request if the URL does not look like an archive, only the GET request that retrieves data. """ session = mock.Mock(PipSession) @@ -179,7 +181,7 @@ def test_get_html_response_no_head(mock_raise_for_status: mock.Mock, url: str) - ) ) - resp = _get_html_response(url, session=session) + resp = _get_simple_response(url, session=session) assert resp is not None assert session.head.call_count == 0 @@ -197,11 +199,11 @@ def test_get_html_response_no_head(mock_raise_for_status: mock.Mock, url: str) - @mock.patch("pip._internal.index.collector.raise_for_status") -def test_get_html_response_dont_log_clear_text_password( +def test_get_simple_response_dont_log_clear_text_password( mock_raise_for_status: mock.Mock, caplog: pytest.LogCaptureFixture ) -> None: """ - `_get_html_response()` should redact the password from the index URL + `_get_simple_response()` should redact the password from the index URL in its DEBUG log message. 
""" session = mock.Mock(PipSession) @@ -217,7 +219,7 @@ def test_get_html_response_dont_log_clear_text_password( caplog.set_level(logging.DEBUG) - resp = _get_html_response( + resp = _get_simple_response( "https://user:my_password@example.com/simple/", session=session ) @@ -428,6 +430,7 @@ def _test_parse_links_data_attribute( html_bytes = html.encode("utf-8") page = IndexContent( html_bytes, + "text/html", encoding=None, # parse_links() is cached by url, so we inject a random uuid to ensure # the page content isn't cached. @@ -505,6 +508,7 @@ def test_parse_links_caches_same_page_by_url() -> None: page_1 = IndexContent( html_bytes, + "text/html", encoding=None, url=url, ) @@ -512,6 +516,7 @@ def test_parse_links_caches_same_page_by_url() -> None: # because the page was cached by url. page_2 = IndexContent( b"", + "text/html", encoding=None, url=url, ) @@ -520,6 +525,7 @@ def test_parse_links_caches_same_page_by_url() -> None: # verify that the result is not cached. page_3 = IndexContent( re.sub(b"pkg1", b"pkg2", html_bytes), + "text/html", encoding=None, url=url, cache_link_parsing=False, @@ -541,7 +547,9 @@ def test_parse_links_caches_same_page_by_url() -> None: def test_parse_link_handles_deprecated_usage_properly() -> None: html = b'' url = "https://example.com/simple/" - page = IndexContent(html, encoding=None, url=url, cache_link_parsing=False) + page = IndexContent( + html, "text/html", encoding=None, url=url, cache_link_parsing=False + ) parsed_links = list(parse_links(page, use_deprecated_html5lib=True)) @@ -559,7 +567,7 @@ def test_request_http_error( session = mock.Mock(PipSession) session.get.return_value = mock.Mock() mock_raise_for_status.side_effect = NetworkConnectionError("Http error") - assert _get_html_page(link, session=session) is None + assert _get_index_content(link, session=session) is None assert "Could not fetch URL http://localhost: Http error - skipping" in caplog.text @@ -568,11 +576,11 @@ def test_request_retries(caplog: 
pytest.LogCaptureFixture) -> None: link = Link("http://localhost") session = mock.Mock(PipSession) session.get.side_effect = requests.exceptions.RetryError("Retry error") - assert _get_html_page(link, session=session) is None + assert _get_index_content(link, session=session) is None assert "Could not fetch URL http://localhost: Retry error - skipping" in caplog.text -def test_make_html_page() -> None: +def test_make_index_content() -> None: headers = {"Content-Type": "text/html; charset=UTF-8"} response = mock.Mock( content=b"", @@ -580,7 +588,7 @@ def test_make_html_page() -> None: headers=headers, ) - actual = _make_html_page(response) + actual = _make_index_content(response) assert actual.content == b"" assert actual.encoding == "UTF-8" assert actual.url == "https://example.com/index.html" @@ -593,15 +601,15 @@ def test_make_html_page() -> None: ("git+https://github.com/pypa/pip.git", "git"), ], ) -def test_get_html_page_invalid_scheme( +def test_get_index_content_invalid_scheme( caplog: pytest.LogCaptureFixture, url: str, vcs_scheme: str ) -> None: - """`_get_html_page()` should error if an invalid scheme is given. + """`_get_index_content()` should error if an invalid scheme is given. Only file:, http:, https:, and ftp: are allowed. """ with caplog.at_level(logging.WARNING): - page = _get_html_page(Link(url), session=mock.Mock(PipSession)) + page = _get_index_content(Link(url), session=mock.Mock(PipSession)) assert page is None assert caplog.record_tuples == [ @@ -622,12 +630,12 @@ def test_get_html_page_invalid_scheme( ], ) @mock.patch("pip._internal.index.collector.raise_for_status") -def test_get_html_page_invalid_content_type( +def test_get_index_content_invalid_content_type( mock_raise_for_status: mock.Mock, caplog: pytest.LogCaptureFixture, content_type: str, ) -> None: - """`_get_html_page()` should warn if an invalid content-type is given. + """`_get_index_content()` should warn if an invalid content-type is given. Only text/html is allowed. 
""" caplog.set_level(logging.DEBUG) @@ -641,7 +649,7 @@ def test_get_html_page_invalid_content_type( "headers": {"Content-Type": content_type}, } ) - assert _get_html_page(link, session=session) is None + assert _get_index_content(link, session=session) is None mock_raise_for_status.assert_called_once_with(session.get.return_value) assert ( "pip._internal.index.collector", @@ -667,8 +675,8 @@ def make_fake_html_response(url: str) -> mock.Mock: return mock.Mock(content=content, url=url, headers={}) -def test_get_html_page_directory_append_index(tmpdir: Path) -> None: - """`_get_html_page()` should append "index.html" to a directory URL.""" +def test_get_index_content_directory_append_index(tmpdir: Path) -> None: + """`_get_index_content()` should append "index.html" to a directory URL.""" dirpath = tmpdir / "something" dirpath.mkdir() dir_url = dirpath.as_uri() @@ -676,10 +684,10 @@ def test_get_html_page_directory_append_index(tmpdir: Path) -> None: session = mock.Mock(PipSession) fake_response = make_fake_html_response(expected_url) - mock_func = mock.patch("pip._internal.index.collector._get_html_response") + mock_func = mock.patch("pip._internal.index.collector._get_simple_response") with mock_func as mock_func: mock_func.return_value = fake_response - actual = _get_html_page(Link(dir_url), session=session) + actual = _get_index_content(Link(dir_url), session=session) assert mock_func.mock_calls == [ mock.call(expected_url, session=session), ], f"actual calls: {mock_func.mock_calls}" @@ -779,16 +787,16 @@ def check_links_include(links: List[Link], names: List[str]) -> None: class TestLinkCollector: - @mock.patch("pip._internal.index.collector._get_html_response") - def test_fetch_page(self, mock_get_html_response: mock.Mock) -> None: + @mock.patch("pip._internal.index.collector._get_simple_response") + def test_fetch_response(self, mock_get_simple_response: mock.Mock) -> None: url = "https://pypi.org/simple/twine/" fake_response = make_fake_html_response(url) - 
mock_get_html_response.return_value = fake_response + mock_get_simple_response.return_value = fake_response location = Link(url, cache_link_parsing=False) link_collector = make_test_link_collector() - actual = link_collector.fetch_page(location) + actual = link_collector.fetch_response(location) assert actual is not None assert actual.content == fake_response.content @@ -797,8 +805,8 @@ def test_fetch_page(self, mock_get_html_response: mock.Mock) -> None: assert actual.cache_link_parsing == location.cache_link_parsing # Also check that the right session object was passed to - # _get_html_response(). - mock_get_html_response.assert_called_once_with( + # _get_simple_response(). + mock_get_simple_response.assert_called_once_with( url, session=link_collector.session, ) From 9c98346df96f1fb93e36bc9bc4fa01c4d0566f04 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 25 Jun 2022 14:54:02 -0400 Subject: [PATCH 12/21] Add new accept headers --- tests/unit/test_collector.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index f357c588fe0..93e0efbdf99 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -33,6 +33,15 @@ from tests.lib import TestData, make_test_link_collector +ACCEPT = ", ".join( + [ + "application/vnd.pypi.simple.v1+json", + "application/vnd.pypi.simple.v1+html; q=0.1", + "text/html; q=0.01", + ] +) + + @pytest.mark.parametrize( "url", [ @@ -143,7 +152,7 @@ def test_get_simple_response_archive_to_http_scheme_is_html( mock.call.get( url, headers={ - "Accept": "text/html", + "Accept": ACCEPT, "Cache-Control": "max-age=0", }, ), @@ -189,7 +198,7 @@ def test_get_simple_response_no_head( mock.call( url, headers={ - "Accept": "text/html", + "Accept": ACCEPT, "Cache-Control": "max-age=0", }, ), From 799c88f13da710eda20d3d0ad15e4614c26e0120 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 25 Jun 2022 14:58:32 -0400 Subject: [PATCH 
13/21] Update some tests to work again --- tests/unit/test_collector.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index 93e0efbdf99..5ec1f93c424 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -235,12 +235,17 @@ def test_get_simple_response_dont_log_clear_text_password( assert resp is not None mock_raise_for_status.assert_called_once_with(resp) - assert len(caplog.records) == 1 + assert len(caplog.records) == 2 record = caplog.records[0] assert record.levelname == "DEBUG" assert record.message.splitlines() == [ "Getting page https://user:****@example.com/simple/", ] + record = caplog.records[1] + assert record.levelname == "DEBUG" + assert record.message.splitlines() == [ + "Fetched page https://user:****@example.com/simple/ as text/html", + ] @pytest.mark.parametrize( @@ -664,7 +669,8 @@ def test_get_index_content_invalid_content_type( "pip._internal.index.collector", logging.WARNING, "Skipping page {} because the GET request got Content-Type: {}." 
- "The only supported Content-Type is text/html".format(url, content_type), + "The only supported Content-Types are application/vnd.pypi.simple.v1+json, " + "application/vnd.pypi.simple.v1+html, and text/html'".format(url, content_type), ) in caplog.record_tuples @@ -681,7 +687,7 @@ def make_fake_html_response(url: str) -> mock.Mock: """ ) content = html.encode("utf-8") - return mock.Mock(content=content, url=url, headers={}) + return mock.Mock(content=content, url=url, headers={"Content-Type": "text/html"}) def test_get_index_content_directory_append_index(tmpdir: Path) -> None: From d67cac3c1337ebd6b64d58e3365a04c9e392b4f9 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 25 Jun 2022 15:06:08 -0400 Subject: [PATCH 14/21] formatting --- tests/unit/test_collector.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index 5ec1f93c424..93a074409b8 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -32,7 +32,6 @@ from pip._internal.network.session import PipSession from tests.lib import TestData, make_test_link_collector - ACCEPT = ", ".join( [ "application/vnd.pypi.simple.v1+json", From 6f9ccfc1367a2ef043e05586c4a95845c7ea2566 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 25 Jun 2022 15:37:43 -0400 Subject: [PATCH 15/21] Fix link hashes --- src/pip/_internal/index/collector.py | 13 +------------ src/pip/_internal/models/link.py | 22 ++++++++++++++++++---- tests/unit/test_collector.py | 4 ++-- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index b896a57bfdf..956ac34353a 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -34,7 +34,7 @@ from pip._vendor.requests.exceptions import RetryError, SSLError from pip._internal.exceptions import NetworkConnectionError -from pip._internal.models.link import SUPPORTED_HASHES, Link +from 
pip._internal.models.link import Link from pip._internal.models.search_scope import SearchScope from pip._internal.network.session import PipSession from pip._internal.network.utils import raise_for_status @@ -274,11 +274,6 @@ def _clean_link(url: str) -> str: return urllib.parse.urlunparse(result._replace(path=path)) -_HASH_RE = re.compile( - r"({choices})=([a-f0-9]+)".format(choices="|".join(SUPPORTED_HASHES)) -) - - def _create_link_from_element( element_attribs: Dict[str, Optional[str]], page_url: str, @@ -295,17 +290,11 @@ def _create_link_from_element( pyrequire = element_attribs.get("data-requires-python") yanked_reason = element_attribs.get("data-yanked") - hashes = {} - hm = _HASH_RE.search(url) - if hm is not None: - hashes[hm.group(1).lower()] = hm.group(2) - link = Link( url, comes_from=page_url, requires_python=pyrequire, yanked_reason=yanked_reason, - hashes=hashes, ) return link diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py index c44ae1be3f5..655a4072d18 100644 --- a/src/pip/_internal/models/link.py +++ b/src/pip/_internal/models/link.py @@ -33,7 +33,7 @@ # Order matters, earlier hashes have a precedence over later hashes for what # we will pick to use. 
-SUPPORTED_HASHES = ("sha512", "sha384", "sha256", "sha224", "sha1", "md5") +_SUPPORTED_HASHES = ("sha512", "sha384", "sha256", "sha224", "sha1", "md5") class Link(KeyBasedCompareMixin): @@ -179,18 +179,32 @@ def subdirectory_fragment(self) -> Optional[str]: return None return match.group(1) + _hash_re = re.compile( + r"({choices})=([a-f0-9]+)".format(choices="|".join(_SUPPORTED_HASHES)) + ) + @property def hash(self) -> Optional[str]: - for hashname in SUPPORTED_HASHES: + for hashname in _SUPPORTED_HASHES: if hashname in self._hashes: return self._hashes[hashname] + + match = self._hash_re.search(self._url) + if match: + return match.group(2) + return None @property def hash_name(self) -> Optional[str]: - for hashname in SUPPORTED_HASHES: + for hashname in _SUPPORTED_HASHES: if hashname in self._hashes: return hashname + + match = self._hash_re.search(self._url) + if match: + return match.group(1) + return None @property @@ -284,7 +298,7 @@ def _clean_link(link: Link) -> _CleanResult: subdirectory = "" # If there are multiple hash values under the same algorithm, use the # first one. This matches the behavior of Link.hash_value. 
- hashes = {k: fragment[k][0] for k in SUPPORTED_HASHES if k in fragment} + hashes = {k: fragment[k][0] for k in _SUPPORTED_HASHES if k in fragment} return _CleanResult( parsed=parsed._replace(netloc=netloc, query="", fragment=""), query=urllib.parse.parse_qs(parsed.query), diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index 93a074409b8..240b53d04b9 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -201,7 +201,7 @@ def test_get_simple_response_no_head( "Cache-Control": "max-age=0", }, ), - mock.call().headers.get("Content-Type", ""), + mock.call().headers.get("Content-Type", "Unknown"), ] mock_raise_for_status.assert_called_once_with(resp) @@ -667,7 +667,7 @@ def test_get_index_content_invalid_content_type( assert ( "pip._internal.index.collector", logging.WARNING, - "Skipping page {} because the GET request got Content-Type: {}." + "Skipping page {} because the GET request got Content-Type: {}. " "The only supported Content-Types are application/vnd.pypi.simple.v1+json, " "application/vnd.pypi.simple.v1+html, and text/html'".format(url, content_type), ) in caplog.record_tuples From 7cb5aae8809ee45e3d3f2a0579db73b67717a86b Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sun, 26 Jun 2022 02:28:31 -0400 Subject: [PATCH 16/21] fix a stray ' --- tests/unit/test_collector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index 240b53d04b9..381cde6eaa8 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -669,7 +669,7 @@ def test_get_index_content_invalid_content_type( logging.WARNING, "Skipping page {} because the GET request got Content-Type: {}. 
" "The only supported Content-Types are application/vnd.pypi.simple.v1+json, " - "application/vnd.pypi.simple.v1+html, and text/html'".format(url, content_type), + "application/vnd.pypi.simple.v1+html, and text/html".format(url, content_type), ) in caplog.record_tuples From 5f3fbf040ca58b9794c3f701665cc10b1495ba37 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sun, 26 Jun 2022 02:30:25 -0400 Subject: [PATCH 17/21] Always use Unknown when unknown --- src/pip/_internal/index/collector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index 956ac34353a..0d3d087fd84 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -80,7 +80,7 @@ def _ensure_api_header(response: Response) -> None: Raises `_NotAPIContent` if the content type is not a valid content-type. """ - content_type = response.headers.get("Content-Type", "") + content_type = response.headers.get("Content-Type", "Unknown") content_type_l = content_type.lower() if content_type_l.startswith( From 68836d479fc3d4626a8827ef8672df4a34d52baa Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sun, 26 Jun 2022 02:36:36 -0400 Subject: [PATCH 18/21] fix number of calls --- tests/unit/test_collector.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index 381cde6eaa8..e49c47c54b2 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -202,6 +202,7 @@ def test_get_simple_response_no_head( }, ), mock.call().headers.get("Content-Type", "Unknown"), + mock.call().headers.get("Content-Type", "Unknown"), ] mock_raise_for_status.assert_called_once_with(resp) From 41051f5f7bd084b8aea2a0d42563f5470f10ffb6 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Fri, 15 Jul 2022 17:40:52 -0400 Subject: [PATCH 19/21] Better expand on our TODO --- src/pip/_internal/index/collector.py | 5 ++++- 1 file changed, 4 
insertions(+), 1 deletion(-) diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index 0d3d087fd84..04646ae1121 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -525,9 +525,12 @@ def _get_index_content( # final segment if not url.endswith("/"): url += "/" + # TODO: In the future, it would be nice if pip supported PEP 691 + # style responses in the file:// URLs, however there's no + # standard file extension for application/vnd.pypi.simple.v1+json + # so we'll need to come up with something on our own. url = urllib.parse.urljoin(url, "index.html") logger.debug(" file: URL is directory, getting %s", url) - # TODO: index.json? try: resp = _get_simple_response(url, session=session) From c1b46c19cfb11e2b5b444fc0da0b8d19b9d1df1a Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Fri, 15 Jul 2022 17:43:17 -0400 Subject: [PATCH 20/21] Document the hashes parameter --- src/pip/_internal/models/link.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py index 655a4072d18..8fd1c3d9960 100644 --- a/src/pip/_internal/models/link.py +++ b/src/pip/_internal/models/link.py @@ -77,6 +77,8 @@ def __init__( should be cached. PyPI index urls should generally have this set to False, for example. + :param hashes: A mapping of hash names to digests to allow us to + determine the validity of a download. 
""" # url can be a UNC windows share From c1b50e1f4dcfe7803e0a4bd12c6ee01873916bd0 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Fri, 15 Jul 2022 17:55:15 -0400 Subject: [PATCH 21/21] Add a test for parsing links from JSON --- tests/unit/test_collector.py | 52 ++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index e49c47c54b2..eff2594cad9 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -1,4 +1,5 @@ import itertools +import json import logging import os import re @@ -481,6 +482,57 @@ def test_parse_links__requires_python( _test_parse_links_data_attribute(anchor_html, "requires_python", expected) +def test_parse_links_json() -> None: + json_bytes = json.dumps( + { + "meta": {"api-version": "1.0"}, + "name": "holygrail", + "files": [ + { + "filename": "holygrail-1.0.tar.gz", + "url": "https://example.com/files/holygrail-1.0.tar.gz", + "hashes": {"sha256": "sha256 hash", "blake2b": "blake2b hash"}, + "requires-python": ">=3.7", + "yanked": "Had a vulnerability", + }, + { + "filename": "holygrail-1.0-py3-none-any.whl", + "url": "/files/holygrail-1.0-py3-none-any.whl", + "hashes": {"sha256": "sha256 hash", "blake2b": "blake2b hash"}, + "requires-python": ">=3.7", + "dist-info-metadata": False, + }, + ], + } + ).encode("utf8") + page = IndexContent( + json_bytes, + "application/vnd.pypi.simple.v1+json", + encoding=None, + # parse_links() is cached by url, so we inject a random uuid to ensure + # the page content isn't cached. 
+ url=f"https://example.com/simple-{uuid.uuid4()}/", + ) + links = list(parse_links(page, use_deprecated_html5lib=False)) + + assert links == [ + Link( + "https://example.com/files/holygrail-1.0.tar.gz", + comes_from=page.url, + requires_python=">=3.7", + yanked_reason="Had a vulnerability", + hashes={"sha256": "sha256 hash", "blake2b": "blake2b hash"}, + ), + Link( + "https://example.com/files/holygrail-1.0-py3-none-any.whl", + comes_from=page.url, + requires_python=">=3.7", + yanked_reason=None, + hashes={"sha256": "sha256 hash", "blake2b": "blake2b hash"}, + ), + ] + + @pytest.mark.parametrize( "anchor_html, expected", [