From 6f167b55a9293b423244318f1ae77fe576f2ee43 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Wed, 1 Jun 2022 02:06:40 -0400 Subject: [PATCH 01/21] PoC of PEP 691 --- src/pip/_internal/index/collector.py | 154 +++++++++++++++------- src/pip/_internal/index/package_finder.py | 6 +- 2 files changed, 106 insertions(+), 54 deletions(-) diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index c79e2410c80..8f70e713e59 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -6,6 +6,7 @@ import email.message import functools import itertools +import json import logging import os import re @@ -65,32 +66,44 @@ def _match_vcs_scheme(url: str) -> Optional[str]: return None -class _NotHTML(Exception): +class _NotAPIContent(Exception): def __init__(self, content_type: str, request_desc: str) -> None: super().__init__(content_type, request_desc) self.content_type = content_type self.request_desc = request_desc -def _ensure_html_header(response: Response) -> None: - """Check the Content-Type header to ensure the response contains HTML. +def _ensure_api_header(response: Response) -> None: + """ + Check the Content-Type header to ensure the response contains a Simple + API Response. - Raises `_NotHTML` if the content type is not text/html. + Raises `_NotAPIContent` if the content type is not a valid content-type. 
""" content_type = response.headers.get("Content-Type", "") - if not content_type.lower().startswith("text/html"): - raise _NotHTML(content_type, response.request.method) + + content_type_l = content_type.lower() + if content_type_l.startswith("text/html"): + return + elif content_type_l.startswith("application/vnd.pypi.simple.v1+html"): + return + elif content_type_l.startswith("application/vnd.pypi.simple.v1+json"): + return + + raise _NotAPIContent(content_type, response.request.method) class _NotHTTP(Exception): pass -def _ensure_html_response(url: str, session: PipSession) -> None: - """Send a HEAD request to the URL, and ensure the response contains HTML. +def _ensure_api_response(url: str, session: PipSession) -> None: + """ + Send a HEAD request to the URL, and ensure the response contains a simple + API Response. Raises `_NotHTTP` if the URL is not available for a HEAD request, or - `_NotHTML` if the content type is not text/html. + `_NotAPIContent` if the content type is not a valid content type. """ scheme, netloc, path, query, fragment = urllib.parse.urlsplit(url) if scheme not in {"http", "https"}: @@ -99,31 +112,37 @@ def _ensure_html_response(url: str, session: PipSession) -> None: resp = session.head(url, allow_redirects=True) raise_for_status(resp) - _ensure_html_header(resp) + _ensure_api_header(resp) -def _get_html_response(url: str, session: PipSession) -> Response: - """Access an HTML page with GET, and return the response. +def _get_simple_response(url: str, session: PipSession) -> Response: + """Access an Simple API response with GET, and return the response. This consists of three parts: 1. If the URL looks suspiciously like an archive, send a HEAD first to - check the Content-Type is HTML, to avoid downloading a large file. - Raise `_NotHTTP` if the content type cannot be determined, or - `_NotHTML` if it is not HTML. + check the Content-Type is HTML or Simple API, to avoid downloading a + large file. 
Raise `_NotHTTP` if the content type cannot be determined, or + `_NotAPIContent` if it is not HTML or a Simple API. 2. Actually perform the request. Raise HTTP exceptions on network failures. - 3. Check the Content-Type header to make sure we got HTML, and raise - `_NotHTML` otherwise. + 3. Check the Content-Type header to make sure we got a Simple API response, + and raise `_NotAPIContent` otherwise. """ if is_archive_file(Link(url).filename): - _ensure_html_response(url, session=session) + _ensure_api_response(url, session=session) logger.debug("Getting page %s", redact_auth_from_url(url)) resp = session.get( url, headers={ - "Accept": "text/html", + "Accept": ", ".join( + [ + "application/vnd.pypi.simple.v1+json", + "application/vnd.pypi.simple.v1+html; q=0.2", + "text/html; q=0.1", + ] + ), # We don't want to blindly returned cached data for # /simple/, because authors generally expecting that # twine upload && pip install will function, but if @@ -145,9 +164,10 @@ def _get_html_response(url: str, session: PipSession) -> Response: # The check for archives above only works if the url ends with # something that looks like an archive. However that is not a # requirement of an url. Unless we issue a HEAD request on every - # url we cannot know ahead of time for sure if something is HTML - # or not. However we can check after we've downloaded it. - _ensure_html_header(resp) + # url we cannot know ahead of time for sure if something is a + # Simple API response or not. However we can check after we've + # downloaded it. 
+ _ensure_api_header(resp) return resp @@ -273,7 +293,7 @@ def _create_link_from_element( class CacheablePageContent: - def __init__(self, page: "HTMLPage") -> None: + def __init__(self, page: "IndexContent") -> None: assert page.cache_link_parsing self.page = page @@ -286,15 +306,15 @@ def __hash__(self) -> int: class ParseLinks(Protocol): def __call__( - self, page: "HTMLPage", use_deprecated_html5lib: bool + self, page: "IndexContent", use_deprecated_html5lib: bool ) -> Iterable[Link]: ... -def with_cached_html_pages(fn: ParseLinks) -> ParseLinks: +def with_cached_index_content(fn: ParseLinks) -> ParseLinks: """ - Given a function that parses an Iterable[Link] from an HTMLPage, cache the - function's result (keyed by CacheablePageContent), unless the HTMLPage + Given a function that parses an Iterable[Link] from an IndexContent, cache the + function's result (keyed by CacheablePageContent), unless the IndexContent `page` has `page.cache_link_parsing == False`. """ @@ -305,7 +325,9 @@ def wrapper( return list(fn(cacheable_page.page, use_deprecated_html5lib)) @functools.wraps(fn) - def wrapper_wrapper(page: "HTMLPage", use_deprecated_html5lib: bool) -> List[Link]: + def wrapper_wrapper( + page: "IndexContent", use_deprecated_html5lib: bool + ) -> List[Link]: if page.cache_link_parsing: return wrapper(CacheablePageContent(page), use_deprecated_html5lib) return list(fn(page, use_deprecated_html5lib)) @@ -313,7 +335,7 @@ def wrapper_wrapper(page: "HTMLPage", use_deprecated_html5lib: bool) -> List[Lin return wrapper_wrapper -def _parse_links_html5lib(page: "HTMLPage") -> Iterable[Link]: +def _parse_links_html5lib(page: "IndexContent") -> Iterable[Link]: """ Parse an HTML document, and yield its anchor elements as Link objects. 
@@ -338,12 +360,35 @@ def _parse_links_html5lib(page: "HTMLPage") -> Iterable[Link]: yield link -@with_cached_html_pages -def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Link]: +@with_cached_index_content +def parse_links(page: "IndexContent", use_deprecated_html5lib: bool) -> Iterable[Link]: """ - Parse an HTML document, and yield its anchor elements as Link objects. + Parse a Simple API's Index Content, and yield its anchor elements as Link objects. """ + content_type_l = page.content_type.lower() + if content_type_l.startswith("application/vnd.pypi.simple.v1+json"): + data = json.loads(page.content) + for file in data.get("files", []): + file_url = file.get("url") + if file_url is None: + continue + + # The Link.yanked_reason expects an empty string instead of a boolean. + yanked_reason = file.get("yanked") + if yanked_reason and not isinstance(yanked_reason, str): + yanked_reason = "" + # The Link.yanked_reason expects None instead of False + elif not yanked_reason: + yanked_reason = None + + yield Link( + _clean_link(urllib.parse.urljoin(page.url, file_url)), + comes_from=page.url, + requires_python=file.get("requires-python"), + yanked_reason=yanked_reason, + ) + if use_deprecated_html5lib: yield from _parse_links_html5lib(page) return @@ -365,12 +410,13 @@ def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Lin yield link -class HTMLPage: - """Represents one page, along with its URL""" +class IndexContent: + """Represents one response (or page), along with its URL""" def __init__( self, content: bytes, + content_type: str, encoding: Optional[str], url: str, cache_link_parsing: bool = True, @@ -383,6 +429,7 @@ def __init__( have this set to False, for example. 
""" self.content = content + self.content_type = content_type self.encoding = encoding self.url = url self.cache_link_parsing = cache_link_parsing @@ -419,7 +466,7 @@ def get_href(self, attrs: List[Tuple[str, Optional[str]]]) -> Optional[str]: return None -def _handle_get_page_fail( +def _handle_get_simple_fail( link: Link, reason: Union[str, Exception], meth: Optional[Callable[..., None]] = None, @@ -429,19 +476,22 @@ def _handle_get_page_fail( meth("Could not fetch URL %s: %s - skipping", link, reason) -def _make_html_page(response: Response, cache_link_parsing: bool = True) -> HTMLPage: +def _make_index_content( + response: Response, cache_link_parsing: bool = True +) -> IndexContent: encoding = _get_encoding_from_headers(response.headers) - return HTMLPage( + return IndexContent( response.content, + response.headers["Content-Type"], encoding=encoding, url=response.url, cache_link_parsing=cache_link_parsing, ) -def _get_html_page( +def _get_index_content( link: Link, session: Optional[PipSession] = None -) -> Optional["HTMLPage"]: +) -> Optional["IndexContent"]: if session is None: raise TypeError( "_get_html_page() missing 1 required keyword argument: 'session'" @@ -468,37 +518,39 @@ def _get_html_page( url += "/" url = urllib.parse.urljoin(url, "index.html") logger.debug(" file: URL is directory, getting %s", url) + # TODO: index.json? try: - resp = _get_html_response(url, session=session) + resp = _get_simple_response(url, session=session) except _NotHTTP: logger.warning( "Skipping page %s because it looks like an archive, and cannot " "be checked by a HTTP HEAD request.", link, ) - except _NotHTML as exc: + except _NotAPIContent as exc: logger.warning( - "Skipping page %s because the %s request got Content-Type: %s." - "The only supported Content-Type is text/html", + "Skipping page %s because the %s request got Content-Type: %s. 
" + "The only supported Content-Types are application/vnd.pypi.simple.v1+json, " + "application/vnd.pypi.simple.v1+html, and text/html", link, exc.request_desc, exc.content_type, ) except NetworkConnectionError as exc: - _handle_get_page_fail(link, exc) + _handle_get_simple_fail(link, exc) except RetryError as exc: - _handle_get_page_fail(link, exc) + _handle_get_simple_fail(link, exc) except SSLError as exc: reason = "There was a problem confirming the ssl certificate: " reason += str(exc) - _handle_get_page_fail(link, reason, meth=logger.info) + _handle_get_simple_fail(link, reason, meth=logger.info) except requests.ConnectionError as exc: - _handle_get_page_fail(link, f"connection error: {exc}") + _handle_get_simple_fail(link, f"connection error: {exc}") except requests.Timeout: - _handle_get_page_fail(link, "timed out") + _handle_get_simple_fail(link, "timed out") else: - return _make_html_page(resp, cache_link_parsing=link.cache_link_parsing) + return _make_index_content(resp, cache_link_parsing=link.cache_link_parsing) return None @@ -561,11 +613,11 @@ def create( def find_links(self) -> List[str]: return self.search_scope.find_links - def fetch_page(self, location: Link) -> Optional[HTMLPage]: + def fetch_response(self, location: Link) -> Optional[IndexContent]: """ Fetch an HTML page containing package links. 
""" - return _get_html_page(location, session=self.session) + return _get_index_content(location, session=self.session) def collect_sources( self, diff --git a/src/pip/_internal/index/package_finder.py b/src/pip/_internal/index/package_finder.py index f70f74b17c6..dbb6a64066c 100644 --- a/src/pip/_internal/index/package_finder.py +++ b/src/pip/_internal/index/package_finder.py @@ -792,11 +792,11 @@ def process_project_url( "Fetching project page and analyzing links: %s", project_url, ) - html_page = self._link_collector.fetch_page(project_url) - if html_page is None: + index_response = self._link_collector.fetch_response(project_url) + if index_response is None: return [] - page_links = list(parse_links(html_page, self._use_deprecated_html5lib)) + page_links = list(parse_links(index_response, self._use_deprecated_html5lib)) with indent_log(): package_links = self.evaluate_links( From 1fc571f3f69e9592c4092709aa0e329d9ed4b78d Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 25 Jun 2022 14:15:28 -0400 Subject: [PATCH 02/21] Support Hashes from PEP691 --- src/pip/_internal/index/collector.py | 14 +++++++++++++- src/pip/_internal/models/link.py | 24 ++++++++++++------------ 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index 8f70e713e59..3aaf5ceee4c 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -34,7 +34,7 @@ from pip._vendor.requests.exceptions import RetryError, SSLError from pip._internal.exceptions import NetworkConnectionError -from pip._internal.models.link import Link +from pip._internal.models.link import Link, SUPPORTED_HASHES from pip._internal.models.search_scope import SearchScope from pip._internal.network.session import PipSession from pip._internal.network.utils import raise_for_status @@ -266,6 +266,11 @@ def _clean_link(url: str) -> str: return urllib.parse.urlunparse(result._replace(path=path)) +_HASH_RE 
= re.compile( + r"({choices})=([a-f0-9]+)".format(choices="|".join(SUPPORTED_HASHES)) +) + + def _create_link_from_element( element_attribs: Dict[str, Optional[str]], page_url: str, @@ -282,11 +287,17 @@ def _create_link_from_element( pyrequire = element_attribs.get("data-requires-python") yanked_reason = element_attribs.get("data-yanked") + hashes = {} + hm = _HASH_RE.search(url) + if hm is not None: + hashes[hm.group(1).lower()] = hm.group(2) + link = Link( url, comes_from=page_url, requires_python=pyrequire, yanked_reason=yanked_reason, + hashes=hashes, ) return link @@ -387,6 +398,7 @@ def parse_links(page: "IndexContent", use_deprecated_html5lib: bool) -> Iterable comes_from=page.url, requires_python=file.get("requires-python"), yanked_reason=yanked_reason, + hashes=file.get("hashes", {}), ) if use_deprecated_html5lib: diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py index 6069b278b9b..238b2b423dc 100644 --- a/src/pip/_internal/models/link.py +++ b/src/pip/_internal/models/link.py @@ -22,7 +22,9 @@ logger = logging.getLogger(__name__) -_SUPPORTED_HASHES = ("sha1", "sha224", "sha384", "sha256", "sha512", "md5") +# Order matters, earlier hashes have a precedence over later hashes for what +# we will pick to use. +SUPPORTED_HASHES = ("sha512", "sha384", "sha256", "sha224", "sha1", "md5") class Link(KeyBasedCompareMixin): @@ -44,6 +46,7 @@ def __init__( requires_python: Optional[str] = None, yanked_reason: Optional[str] = None, cache_link_parsing: bool = True, + hashes: Optional[dict[str, str]] = None, ) -> None: """ :param url: url of the resource pointed to (href of the link) @@ -74,6 +77,7 @@ def __init__( # Store the url as a private attribute to prevent accidentally # trying to set a new value. 
self._url = url + self._hashes = hashes if hashes is not None else {} self.comes_from = comes_from self.requires_python = requires_python if requires_python else None @@ -165,22 +169,18 @@ def subdirectory_fragment(self) -> Optional[str]: return None return match.group(1) - _hash_re = re.compile( - r"({choices})=([a-f0-9]+)".format(choices="|".join(_SUPPORTED_HASHES)) - ) - @property def hash(self) -> Optional[str]: - match = self._hash_re.search(self._url) - if match: - return match.group(2) + for hashname in SUPPORTED_HASHES: + if hashname in self._hashes: + return self._hashes[hashname] return None @property def hash_name(self) -> Optional[str]: - match = self._hash_re.search(self._url) - if match: - return match.group(1) + for hashname in SUPPORTED_HASHES: + if hashname in self._hashes: + return hashname return None @property @@ -274,7 +274,7 @@ def _clean_link(link: Link) -> _CleanResult: subdirectory = "" # If there are multiple hash values under the same algorithm, use the # first one. This matches the behavior of Link.hash_value. 
- hashes = {k: fragment[k][0] for k in _SUPPORTED_HASHES if k in fragment} + hashes = {k: fragment[k][0] for k in SUPPORTED_HASHES if k in fragment} return _CleanResult( parsed=parsed._replace(netloc=netloc, query="", fragment=""), query=urllib.parse.parse_qs(parsed.query), From a287a0bff360414c38bc0540d28653cd8ffe4ce1 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 25 Jun 2022 14:17:04 -0400 Subject: [PATCH 03/21] Use a tuple and one call to startswith --- src/pip/_internal/index/collector.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index 3aaf5ceee4c..2e219021548 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -83,11 +83,13 @@ def _ensure_api_header(response: Response) -> None: content_type = response.headers.get("Content-Type", "") content_type_l = content_type.lower() - if content_type_l.startswith("text/html"): - return - elif content_type_l.startswith("application/vnd.pypi.simple.v1+html"): - return - elif content_type_l.startswith("application/vnd.pypi.simple.v1+json"): + if content_type_l.startswith( + ( + "text/html", + "application/vnd.pypi.simple.v1+html", + "application/vnd.pypi.simple.v1+json", + ) + ): return raise _NotAPIContent(content_type, response.request.method) From 9c58d2d11bdc9bb4690378921c676c79e2b3ca26 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 25 Jun 2022 14:17:15 -0400 Subject: [PATCH 04/21] Use smaller values for HTML --- src/pip/_internal/index/collector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index 2e219021548..a2319f624ee 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -141,8 +141,8 @@ def _get_simple_response(url: str, session: PipSession) -> Response: "Accept": ", ".join( [ 
"application/vnd.pypi.simple.v1+json", - "application/vnd.pypi.simple.v1+html; q=0.2", - "text/html; q=0.1", + "application/vnd.pypi.simple.v1+html; q=0.1", + "text/html; q=0.01", ] ), # We don't want to blindly returned cached data for From 21fadc58584473f7113f24ce7dd19c95b3908f7c Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 25 Jun 2022 14:21:29 -0400 Subject: [PATCH 05/21] fix typing --- src/pip/_internal/models/link.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py index 238b2b423dc..ea18e382244 100644 --- a/src/pip/_internal/models/link.py +++ b/src/pip/_internal/models/link.py @@ -17,7 +17,7 @@ from pip._internal.utils.urls import path_to_url, url_to_path if TYPE_CHECKING: - from pip._internal.index.collector import HTMLPage + from pip._internal.index.collector import IndexContent logger = logging.getLogger(__name__) @@ -33,6 +33,7 @@ class Link(KeyBasedCompareMixin): __slots__ = [ "_parsed_url", "_url", + "_hashes", "comes_from", "requires_python", "yanked_reason", @@ -42,7 +43,7 @@ class Link(KeyBasedCompareMixin): def __init__( self, url: str, - comes_from: Optional[Union[str, "HTMLPage"]] = None, + comes_from: Optional[Union[str, "IndexContent"]] = None, requires_python: Optional[str] = None, yanked_reason: Optional[str] = None, cache_link_parsing: bool = True, @@ -50,7 +51,7 @@ def __init__( ) -> None: """ :param url: url of the resource pointed to (href of the link) - :param comes_from: instance of HTMLPage where the link was found, + :param comes_from: instance of IndexContent where the link was found, or string. :param requires_python: String containing the `Requires-Python` metadata field, specified in PEP 345. 
This may be specified by From 55192a16747fbf73e37d213de11b228b24e4dee7 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 25 Jun 2022 14:23:00 -0400 Subject: [PATCH 06/21] Fix tests --- tests/unit/test_collector.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index 868a13b03b2..7c77a40dfb9 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -13,7 +13,7 @@ from pip._internal.exceptions import NetworkConnectionError from pip._internal.index.collector import ( - HTMLPage, + IndexContent, LinkCollector, _clean_link, _clean_url_path, @@ -426,7 +426,7 @@ def _test_parse_links_data_attribute( "{}" ).format(anchor_html) html_bytes = html.encode("utf-8") - page = HTMLPage( + page = IndexContent( html_bytes, encoding=None, # parse_links() is cached by url, so we inject a random uuid to ensure @@ -503,14 +503,14 @@ def test_parse_links_caches_same_page_by_url() -> None: url = "https://example.com/simple/" - page_1 = HTMLPage( + page_1 = IndexContent( html_bytes, encoding=None, url=url, ) # Make a second page with zero content, to ensure that it's not accessed, # because the page was cached by url. - page_2 = HTMLPage( + page_2 = IndexContent( b"", encoding=None, url=url, @@ -518,7 +518,7 @@ def test_parse_links_caches_same_page_by_url() -> None: # Make a third page which represents an index url, which should not be # cached, even for the same url. We modify the page content slightly to # verify that the result is not cached. 
- page_3 = HTMLPage( + page_3 = IndexContent( re.sub(b"pkg1", b"pkg2", html_bytes), encoding=None, url=url, @@ -541,7 +541,7 @@ def test_parse_links_caches_same_page_by_url() -> None: def test_parse_link_handles_deprecated_usage_properly() -> None: html = b'' url = "https://example.com/simple/" - page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False) + page = IndexContent(html, encoding=None, url=url, cache_link_parsing=False) parsed_links = list(parse_links(page, use_deprecated_html5lib=True)) From 63bd865ef772fb9334b0029de04883b80f9bb06f Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 25 Jun 2022 14:30:07 -0400 Subject: [PATCH 07/21] Add logging --- src/pip/_internal/index/collector.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index a2319f624ee..7bdbb928858 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -171,6 +171,12 @@ def _get_simple_response(url: str, session: PipSession) -> Response: # downloaded it. 
_ensure_api_header(resp) + logger.debug( + "Fetched page %s as %s", + redact_auth_from_url(url), + resp.headers.get("Content-Type", "Unknown"), + ) + return resp From 5600eea70bc25d7ab24bf59d358b45162b75728e Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 25 Jun 2022 14:32:03 -0400 Subject: [PATCH 08/21] Fix Typing on older Pythons --- src/pip/_internal/models/link.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py index ea18e382244..0c6cb5f74f2 100644 --- a/src/pip/_internal/models/link.py +++ b/src/pip/_internal/models/link.py @@ -4,7 +4,16 @@ import posixpath import re import urllib.parse -from typing import TYPE_CHECKING, Dict, List, NamedTuple, Optional, Tuple, Union +from typing import ( + TYPE_CHECKING, + Dict, + List, + NamedTuple, + Optional, + Tuple, + Union, + Mapping, +) from pip._internal.utils.filetypes import WHEEL_EXTENSION from pip._internal.utils.hashes import Hashes @@ -47,7 +56,7 @@ def __init__( requires_python: Optional[str] = None, yanked_reason: Optional[str] = None, cache_link_parsing: bool = True, - hashes: Optional[dict[str, str]] = None, + hashes: Optional[Mapping[str, str]] = None, ) -> None: """ :param url: url of the resource pointed to (href of the link) From b64a067e5345452d29706be3d9afd0641fb11bba Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 25 Jun 2022 14:33:08 -0400 Subject: [PATCH 09/21] Fix import ordering --- src/pip/_internal/index/collector.py | 2 +- src/pip/_internal/models/link.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index 7bdbb928858..b896a57bfdf 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -34,7 +34,7 @@ from pip._vendor.requests.exceptions import RetryError, SSLError from pip._internal.exceptions import NetworkConnectionError -from 
pip._internal.models.link import Link, SUPPORTED_HASHES +from pip._internal.models.link import SUPPORTED_HASHES, Link from pip._internal.models.search_scope import SearchScope from pip._internal.network.session import PipSession from pip._internal.network.utils import raise_for_status diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py index 0c6cb5f74f2..c44ae1be3f5 100644 --- a/src/pip/_internal/models/link.py +++ b/src/pip/_internal/models/link.py @@ -8,11 +8,11 @@ TYPE_CHECKING, Dict, List, + Mapping, NamedTuple, Optional, Tuple, Union, - Mapping, ) from pip._internal.utils.filetypes import WHEEL_EXTENSION From cdc2582ac93959984d500de61e6ba8485a2fad46 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 25 Jun 2022 14:37:06 -0400 Subject: [PATCH 10/21] Add a news entry --- news/11158.feature.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 news/11158.feature.rst diff --git a/news/11158.feature.rst b/news/11158.feature.rst new file mode 100644 index 00000000000..74436d7dccf --- /dev/null +++ b/news/11158.feature.rst @@ -0,0 +1 @@ +Support `PEP 691 `_. 
From 21a99e4b87888bc1b14ad6f2f397bed80449fcf3 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 25 Jun 2022 14:45:45 -0400 Subject: [PATCH 11/21] Update tests to use new names --- tests/unit/test_collector.py | 98 +++++++++++++++++++----------------- 1 file changed, 53 insertions(+), 45 deletions(-) diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index 7c77a40dfb9..f357c588fe0 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -18,10 +18,10 @@ _clean_link, _clean_url_path, _determine_base_url, - _get_html_page, - _get_html_response, - _make_html_page, - _NotHTML, + _get_index_content, + _get_simple_response, + _make_index_content, + _NotAPIContent, _NotHTTP, parse_links, ) @@ -40,13 +40,13 @@ "file:///opt/data/pip-18.0.tar.gz", ], ) -def test_get_html_response_archive_to_naive_scheme(url: str) -> None: +def test_get_simple_response_archive_to_naive_scheme(url: str) -> None: """ - `_get_html_response()` should error on an archive-like URL if the scheme + `_get_simple_response()` should error on an archive-like URL if the scheme does not allow "poking" without getting data. """ with pytest.raises(_NotHTTP): - _get_html_response(url, session=mock.Mock(PipSession)) + _get_simple_response(url, session=mock.Mock(PipSession)) @pytest.mark.parametrize( @@ -57,12 +57,12 @@ def test_get_html_response_archive_to_naive_scheme(url: str) -> None: ], ) @mock.patch("pip._internal.index.collector.raise_for_status") -def test_get_html_response_archive_to_http_scheme( +def test_get_simple_response_archive_to_http_scheme( mock_raise_for_status: mock.Mock, url: str, content_type: str ) -> None: """ - `_get_html_response()` should send a HEAD request on an archive-like URL - if the scheme supports it, and raise `_NotHTML` if the response isn't HTML. + `_get_simple_response()` should send a HEAD request on an archive-like URL + if the scheme supports it, and raise `_NotAPIContent` if the response isn't HTML. 
""" session = mock.Mock(PipSession) session.head.return_value = mock.Mock( @@ -72,8 +72,8 @@ def test_get_html_response_archive_to_http_scheme( } ) - with pytest.raises(_NotHTML) as ctx: - _get_html_response(url, session=session) + with pytest.raises(_NotAPIContent) as ctx: + _get_simple_response(url, session=session) session.assert_has_calls( [ @@ -91,10 +91,10 @@ def test_get_html_response_archive_to_http_scheme( ("file:///opt/data/pip-18.0.tar.gz"), ], ) -def test_get_html_page_invalid_content_type_archive( +def test_get_index_content_invalid_content_type_archive( caplog: pytest.LogCaptureFixture, url: str ) -> None: - """`_get_html_page()` should warn if an archive URL is not HTML + """`_get_index_content()` should warn if an archive URL is not HTML and therefore cannot be used for a HEAD request. """ caplog.set_level(logging.WARNING) @@ -102,7 +102,7 @@ def test_get_html_page_invalid_content_type_archive( session = mock.Mock(PipSession) - assert _get_html_page(link, session=session) is None + assert _get_index_content(link, session=session) is None assert ( "pip._internal.index.collector", logging.WARNING, @@ -119,11 +119,11 @@ def test_get_html_page_invalid_content_type_archive( ], ) @mock.patch("pip._internal.index.collector.raise_for_status") -def test_get_html_response_archive_to_http_scheme_is_html( +def test_get_simple_response_archive_to_http_scheme_is_html( mock_raise_for_status: mock.Mock, url: str ) -> None: """ - `_get_html_response()` should work with archive-like URLs if the HEAD + `_get_simple_response()` should work with archive-like URLs if the HEAD request is responded with text/html. 
""" session = mock.Mock(PipSession) @@ -135,7 +135,7 @@ def test_get_html_response_archive_to_http_scheme_is_html( ) session.get.return_value = mock.Mock(headers={"Content-Type": "text/html"}) - resp = _get_html_response(url, session=session) + resp = _get_simple_response(url, session=session) assert resp is not None assert session.mock_calls == [ @@ -163,9 +163,11 @@ def test_get_html_response_archive_to_http_scheme_is_html( ], ) @mock.patch("pip._internal.index.collector.raise_for_status") -def test_get_html_response_no_head(mock_raise_for_status: mock.Mock, url: str) -> None: +def test_get_simple_response_no_head( + mock_raise_for_status: mock.Mock, url: str +) -> None: """ - `_get_html_response()` shouldn't send a HEAD request if the URL does not + `_get_simple_response()` shouldn't send a HEAD request if the URL does not look like an archive, only the GET request that retrieves data. """ session = mock.Mock(PipSession) @@ -179,7 +181,7 @@ def test_get_html_response_no_head(mock_raise_for_status: mock.Mock, url: str) - ) ) - resp = _get_html_response(url, session=session) + resp = _get_simple_response(url, session=session) assert resp is not None assert session.head.call_count == 0 @@ -197,11 +199,11 @@ def test_get_html_response_no_head(mock_raise_for_status: mock.Mock, url: str) - @mock.patch("pip._internal.index.collector.raise_for_status") -def test_get_html_response_dont_log_clear_text_password( +def test_get_simple_response_dont_log_clear_text_password( mock_raise_for_status: mock.Mock, caplog: pytest.LogCaptureFixture ) -> None: """ - `_get_html_response()` should redact the password from the index URL + `_get_simple_response()` should redact the password from the index URL in its DEBUG log message. 
""" session = mock.Mock(PipSession) @@ -217,7 +219,7 @@ def test_get_html_response_dont_log_clear_text_password( caplog.set_level(logging.DEBUG) - resp = _get_html_response( + resp = _get_simple_response( "https://user:my_password@example.com/simple/", session=session ) @@ -428,6 +430,7 @@ def _test_parse_links_data_attribute( html_bytes = html.encode("utf-8") page = IndexContent( html_bytes, + "text/html", encoding=None, # parse_links() is cached by url, so we inject a random uuid to ensure # the page content isn't cached. @@ -505,6 +508,7 @@ def test_parse_links_caches_same_page_by_url() -> None: page_1 = IndexContent( html_bytes, + "text/html", encoding=None, url=url, ) @@ -512,6 +516,7 @@ def test_parse_links_caches_same_page_by_url() -> None: # because the page was cached by url. page_2 = IndexContent( b"", + "text/html", encoding=None, url=url, ) @@ -520,6 +525,7 @@ def test_parse_links_caches_same_page_by_url() -> None: # verify that the result is not cached. page_3 = IndexContent( re.sub(b"pkg1", b"pkg2", html_bytes), + "text/html", encoding=None, url=url, cache_link_parsing=False, @@ -541,7 +547,9 @@ def test_parse_links_caches_same_page_by_url() -> None: def test_parse_link_handles_deprecated_usage_properly() -> None: html = b'' url = "https://example.com/simple/" - page = IndexContent(html, encoding=None, url=url, cache_link_parsing=False) + page = IndexContent( + html, "text/html", encoding=None, url=url, cache_link_parsing=False + ) parsed_links = list(parse_links(page, use_deprecated_html5lib=True)) @@ -559,7 +567,7 @@ def test_request_http_error( session = mock.Mock(PipSession) session.get.return_value = mock.Mock() mock_raise_for_status.side_effect = NetworkConnectionError("Http error") - assert _get_html_page(link, session=session) is None + assert _get_index_content(link, session=session) is None assert "Could not fetch URL http://localhost: Http error - skipping" in caplog.text @@ -568,11 +576,11 @@ def test_request_retries(caplog: 
pytest.LogCaptureFixture) -> None: link = Link("http://localhost") session = mock.Mock(PipSession) session.get.side_effect = requests.exceptions.RetryError("Retry error") - assert _get_html_page(link, session=session) is None + assert _get_index_content(link, session=session) is None assert "Could not fetch URL http://localhost: Retry error - skipping" in caplog.text -def test_make_html_page() -> None: +def test_make_index_content() -> None: headers = {"Content-Type": "text/html; charset=UTF-8"} response = mock.Mock( content=b"", @@ -580,7 +588,7 @@ def test_make_html_page() -> None: headers=headers, ) - actual = _make_html_page(response) + actual = _make_index_content(response) assert actual.content == b"" assert actual.encoding == "UTF-8" assert actual.url == "https://example.com/index.html" @@ -593,15 +601,15 @@ def test_make_html_page() -> None: ("git+https://github.com/pypa/pip.git", "git"), ], ) -def test_get_html_page_invalid_scheme( +def test_get_index_content_invalid_scheme( caplog: pytest.LogCaptureFixture, url: str, vcs_scheme: str ) -> None: - """`_get_html_page()` should error if an invalid scheme is given. + """`_get_index_content()` should error if an invalid scheme is given. Only file:, http:, https:, and ftp: are allowed. """ with caplog.at_level(logging.WARNING): - page = _get_html_page(Link(url), session=mock.Mock(PipSession)) + page = _get_index_content(Link(url), session=mock.Mock(PipSession)) assert page is None assert caplog.record_tuples == [ @@ -622,12 +630,12 @@ def test_get_html_page_invalid_scheme( ], ) @mock.patch("pip._internal.index.collector.raise_for_status") -def test_get_html_page_invalid_content_type( +def test_get_index_content_invalid_content_type( mock_raise_for_status: mock.Mock, caplog: pytest.LogCaptureFixture, content_type: str, ) -> None: - """`_get_html_page()` should warn if an invalid content-type is given. + """`_get_index_content()` should warn if an invalid content-type is given. Only text/html is allowed. 
""" caplog.set_level(logging.DEBUG) @@ -641,7 +649,7 @@ def test_get_html_page_invalid_content_type( "headers": {"Content-Type": content_type}, } ) - assert _get_html_page(link, session=session) is None + assert _get_index_content(link, session=session) is None mock_raise_for_status.assert_called_once_with(session.get.return_value) assert ( "pip._internal.index.collector", @@ -667,8 +675,8 @@ def make_fake_html_response(url: str) -> mock.Mock: return mock.Mock(content=content, url=url, headers={}) -def test_get_html_page_directory_append_index(tmpdir: Path) -> None: - """`_get_html_page()` should append "index.html" to a directory URL.""" +def test_get_index_content_directory_append_index(tmpdir: Path) -> None: + """`_get_index_content()` should append "index.html" to a directory URL.""" dirpath = tmpdir / "something" dirpath.mkdir() dir_url = dirpath.as_uri() @@ -676,10 +684,10 @@ def test_get_html_page_directory_append_index(tmpdir: Path) -> None: session = mock.Mock(PipSession) fake_response = make_fake_html_response(expected_url) - mock_func = mock.patch("pip._internal.index.collector._get_html_response") + mock_func = mock.patch("pip._internal.index.collector._get_simple_response") with mock_func as mock_func: mock_func.return_value = fake_response - actual = _get_html_page(Link(dir_url), session=session) + actual = _get_index_content(Link(dir_url), session=session) assert mock_func.mock_calls == [ mock.call(expected_url, session=session), ], f"actual calls: {mock_func.mock_calls}" @@ -779,16 +787,16 @@ def check_links_include(links: List[Link], names: List[str]) -> None: class TestLinkCollector: - @mock.patch("pip._internal.index.collector._get_html_response") - def test_fetch_page(self, mock_get_html_response: mock.Mock) -> None: + @mock.patch("pip._internal.index.collector._get_simple_response") + def test_fetch_response(self, mock_get_simple_response: mock.Mock) -> None: url = "https://pypi.org/simple/twine/" fake_response = make_fake_html_response(url) - 
mock_get_html_response.return_value = fake_response + mock_get_simple_response.return_value = fake_response location = Link(url, cache_link_parsing=False) link_collector = make_test_link_collector() - actual = link_collector.fetch_page(location) + actual = link_collector.fetch_response(location) assert actual is not None assert actual.content == fake_response.content @@ -797,8 +805,8 @@ def test_fetch_page(self, mock_get_html_response: mock.Mock) -> None: assert actual.cache_link_parsing == location.cache_link_parsing # Also check that the right session object was passed to - # _get_html_response(). - mock_get_html_response.assert_called_once_with( + # _get_simple_response(). + mock_get_simple_response.assert_called_once_with( url, session=link_collector.session, ) From 9c98346df96f1fb93e36bc9bc4fa01c4d0566f04 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 25 Jun 2022 14:54:02 -0400 Subject: [PATCH 12/21] Add new accept headers --- tests/unit/test_collector.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index f357c588fe0..93e0efbdf99 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -33,6 +33,15 @@ from tests.lib import TestData, make_test_link_collector +ACCEPT = ", ".join( + [ + "application/vnd.pypi.simple.v1+json", + "application/vnd.pypi.simple.v1+html; q=0.1", + "text/html; q=0.01", + ] +) + + @pytest.mark.parametrize( "url", [ @@ -143,7 +152,7 @@ def test_get_simple_response_archive_to_http_scheme_is_html( mock.call.get( url, headers={ - "Accept": "text/html", + "Accept": ACCEPT, "Cache-Control": "max-age=0", }, ), @@ -189,7 +198,7 @@ def test_get_simple_response_no_head( mock.call( url, headers={ - "Accept": "text/html", + "Accept": ACCEPT, "Cache-Control": "max-age=0", }, ), From 799c88f13da710eda20d3d0ad15e4614c26e0120 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 25 Jun 2022 14:58:32 -0400 Subject: [PATCH 
13/21] Update some tests to work again --- tests/unit/test_collector.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index 93e0efbdf99..5ec1f93c424 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -235,12 +235,17 @@ def test_get_simple_response_dont_log_clear_text_password( assert resp is not None mock_raise_for_status.assert_called_once_with(resp) - assert len(caplog.records) == 1 + assert len(caplog.records) == 2 record = caplog.records[0] assert record.levelname == "DEBUG" assert record.message.splitlines() == [ "Getting page https://user:****@example.com/simple/", ] + record = caplog.records[1] + assert record.levelname == "DEBUG" + assert record.message.splitlines() == [ + "Fetched page https://user:****@example.com/simple/ as text/html", + ] @pytest.mark.parametrize( @@ -664,7 +669,8 @@ def test_get_index_content_invalid_content_type( "pip._internal.index.collector", logging.WARNING, "Skipping page {} because the GET request got Content-Type: {}." 
- "The only supported Content-Type is text/html".format(url, content_type), + "The only supported Content-Types are application/vnd.pypi.simple.v1+json, " + "application/vnd.pypi.simple.v1+html, and text/html'".format(url, content_type), ) in caplog.record_tuples @@ -681,7 +687,7 @@ def make_fake_html_response(url: str) -> mock.Mock: """ ) content = html.encode("utf-8") - return mock.Mock(content=content, url=url, headers={}) + return mock.Mock(content=content, url=url, headers={"Content-Type": "text/html"}) def test_get_index_content_directory_append_index(tmpdir: Path) -> None: From d67cac3c1337ebd6b64d58e3365a04c9e392b4f9 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 25 Jun 2022 15:06:08 -0400 Subject: [PATCH 14/21] formatting --- tests/unit/test_collector.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index 5ec1f93c424..93a074409b8 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -32,7 +32,6 @@ from pip._internal.network.session import PipSession from tests.lib import TestData, make_test_link_collector - ACCEPT = ", ".join( [ "application/vnd.pypi.simple.v1+json", From 6f9ccfc1367a2ef043e05586c4a95845c7ea2566 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sat, 25 Jun 2022 15:37:43 -0400 Subject: [PATCH 15/21] Fix link hashes --- src/pip/_internal/index/collector.py | 13 +------------ src/pip/_internal/models/link.py | 22 ++++++++++++++++++---- tests/unit/test_collector.py | 4 ++-- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index b896a57bfdf..956ac34353a 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -34,7 +34,7 @@ from pip._vendor.requests.exceptions import RetryError, SSLError from pip._internal.exceptions import NetworkConnectionError -from pip._internal.models.link import SUPPORTED_HASHES, Link +from 
pip._internal.models.link import Link from pip._internal.models.search_scope import SearchScope from pip._internal.network.session import PipSession from pip._internal.network.utils import raise_for_status @@ -274,11 +274,6 @@ def _clean_link(url: str) -> str: return urllib.parse.urlunparse(result._replace(path=path)) -_HASH_RE = re.compile( - r"({choices})=([a-f0-9]+)".format(choices="|".join(SUPPORTED_HASHES)) -) - - def _create_link_from_element( element_attribs: Dict[str, Optional[str]], page_url: str, @@ -295,17 +290,11 @@ def _create_link_from_element( pyrequire = element_attribs.get("data-requires-python") yanked_reason = element_attribs.get("data-yanked") - hashes = {} - hm = _HASH_RE.search(url) - if hm is not None: - hashes[hm.group(1).lower()] = hm.group(2) - link = Link( url, comes_from=page_url, requires_python=pyrequire, yanked_reason=yanked_reason, - hashes=hashes, ) return link diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py index c44ae1be3f5..655a4072d18 100644 --- a/src/pip/_internal/models/link.py +++ b/src/pip/_internal/models/link.py @@ -33,7 +33,7 @@ # Order matters, earlier hashes have a precedence over later hashes for what # we will pick to use. 
-SUPPORTED_HASHES = ("sha512", "sha384", "sha256", "sha224", "sha1", "md5") +_SUPPORTED_HASHES = ("sha512", "sha384", "sha256", "sha224", "sha1", "md5") class Link(KeyBasedCompareMixin): @@ -179,18 +179,32 @@ def subdirectory_fragment(self) -> Optional[str]: return None return match.group(1) + _hash_re = re.compile( + r"({choices})=([a-f0-9]+)".format(choices="|".join(_SUPPORTED_HASHES)) + ) + @property def hash(self) -> Optional[str]: - for hashname in SUPPORTED_HASHES: + for hashname in _SUPPORTED_HASHES: if hashname in self._hashes: return self._hashes[hashname] + + match = self._hash_re.search(self._url) + if match: + return match.group(2) + return None @property def hash_name(self) -> Optional[str]: - for hashname in SUPPORTED_HASHES: + for hashname in _SUPPORTED_HASHES: if hashname in self._hashes: return hashname + + match = self._hash_re.search(self._url) + if match: + return match.group(1) + return None @property @@ -284,7 +298,7 @@ def _clean_link(link: Link) -> _CleanResult: subdirectory = "" # If there are multiple hash values under the same algorithm, use the # first one. This matches the behavior of Link.hash_value. 
- hashes = {k: fragment[k][0] for k in SUPPORTED_HASHES if k in fragment} + hashes = {k: fragment[k][0] for k in _SUPPORTED_HASHES if k in fragment} return _CleanResult( parsed=parsed._replace(netloc=netloc, query="", fragment=""), query=urllib.parse.parse_qs(parsed.query), diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index 93a074409b8..240b53d04b9 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -201,7 +201,7 @@ def test_get_simple_response_no_head( "Cache-Control": "max-age=0", }, ), - mock.call().headers.get("Content-Type", ""), + mock.call().headers.get("Content-Type", "Unknown"), ] mock_raise_for_status.assert_called_once_with(resp) @@ -667,7 +667,7 @@ def test_get_index_content_invalid_content_type( assert ( "pip._internal.index.collector", logging.WARNING, - "Skipping page {} because the GET request got Content-Type: {}." + "Skipping page {} because the GET request got Content-Type: {}. " "The only supported Content-Types are application/vnd.pypi.simple.v1+json, " "application/vnd.pypi.simple.v1+html, and text/html'".format(url, content_type), ) in caplog.record_tuples From 7cb5aae8809ee45e3d3f2a0579db73b67717a86b Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sun, 26 Jun 2022 02:28:31 -0400 Subject: [PATCH 16/21] fix a stray ' --- tests/unit/test_collector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index 240b53d04b9..381cde6eaa8 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -669,7 +669,7 @@ def test_get_index_content_invalid_content_type( logging.WARNING, "Skipping page {} because the GET request got Content-Type: {}. 
" "The only supported Content-Types are application/vnd.pypi.simple.v1+json, " - "application/vnd.pypi.simple.v1+html, and text/html'".format(url, content_type), + "application/vnd.pypi.simple.v1+html, and text/html".format(url, content_type), ) in caplog.record_tuples From 5f3fbf040ca58b9794c3f701665cc10b1495ba37 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sun, 26 Jun 2022 02:30:25 -0400 Subject: [PATCH 17/21] Always use Unknown when unknown --- src/pip/_internal/index/collector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index 956ac34353a..0d3d087fd84 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -80,7 +80,7 @@ def _ensure_api_header(response: Response) -> None: Raises `_NotAPIContent` if the content type is not a valid content-type. """ - content_type = response.headers.get("Content-Type", "") + content_type = response.headers.get("Content-Type", "Unknown") content_type_l = content_type.lower() if content_type_l.startswith( From 68836d479fc3d4626a8827ef8672df4a34d52baa Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Sun, 26 Jun 2022 02:36:36 -0400 Subject: [PATCH 18/21] fix number of calls --- tests/unit/test_collector.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index 381cde6eaa8..e49c47c54b2 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -202,6 +202,7 @@ def test_get_simple_response_no_head( }, ), mock.call().headers.get("Content-Type", "Unknown"), + mock.call().headers.get("Content-Type", "Unknown"), ] mock_raise_for_status.assert_called_once_with(resp) From 41051f5f7bd084b8aea2a0d42563f5470f10ffb6 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Fri, 15 Jul 2022 17:40:52 -0400 Subject: [PATCH 19/21] Better expand on our TODO --- src/pip/_internal/index/collector.py | 5 ++++- 1 file changed, 4 
insertions(+), 1 deletion(-) diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index 0d3d087fd84..04646ae1121 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -525,9 +525,12 @@ def _get_index_content( # final segment if not url.endswith("/"): url += "/" + # TODO: In the future, it would be nice if pip supported PEP 691 + # style responses in the file:// URLs, however there's no + # standard file extension for application/vnd.pypi.simple.v1+json + # so we'll need to come up with something on our own. url = urllib.parse.urljoin(url, "index.html") logger.debug(" file: URL is directory, getting %s", url) - # TODO: index.json? try: resp = _get_simple_response(url, session=session) From c1b46c19cfb11e2b5b444fc0da0b8d19b9d1df1a Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Fri, 15 Jul 2022 17:43:17 -0400 Subject: [PATCH 20/21] Document the hashes parameter --- src/pip/_internal/models/link.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py index 655a4072d18..8fd1c3d9960 100644 --- a/src/pip/_internal/models/link.py +++ b/src/pip/_internal/models/link.py @@ -77,6 +77,8 @@ def __init__( should be cached. PyPI index urls should generally have this set to False, for example. + :param hashes: A mapping of hash names to digests to allow us to + determine the validity of a download. 
""" # url can be a UNC windows share From c1b50e1f4dcfe7803e0a4bd12c6ee01873916bd0 Mon Sep 17 00:00:00 2001 From: Donald Stufft Date: Fri, 15 Jul 2022 17:55:15 -0400 Subject: [PATCH 21/21] Add a test for parsing links from JSON --- tests/unit/test_collector.py | 52 ++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index e49c47c54b2..eff2594cad9 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -1,4 +1,5 @@ import itertools +import json import logging import os import re @@ -481,6 +482,57 @@ def test_parse_links__requires_python( _test_parse_links_data_attribute(anchor_html, "requires_python", expected) +def test_parse_links_json() -> None: + json_bytes = json.dumps( + { + "meta": {"api-version": "1.0"}, + "name": "holygrail", + "files": [ + { + "filename": "holygrail-1.0.tar.gz", + "url": "https://example.com/files/holygrail-1.0.tar.gz", + "hashes": {"sha256": "sha256 hash", "blake2b": "blake2b hash"}, + "requires-python": ">=3.7", + "yanked": "Had a vulnerability", + }, + { + "filename": "holygrail-1.0-py3-none-any.whl", + "url": "/files/holygrail-1.0-py3-none-any.whl", + "hashes": {"sha256": "sha256 hash", "blake2b": "blake2b hash"}, + "requires-python": ">=3.7", + "dist-info-metadata": False, + }, + ], + } + ).encode("utf8") + page = IndexContent( + json_bytes, + "application/vnd.pypi.simple.v1+json", + encoding=None, + # parse_links() is cached by url, so we inject a random uuid to ensure + # the page content isn't cached. 
+ url=f"https://example.com/simple-{uuid.uuid4()}/", + ) + links = list(parse_links(page, use_deprecated_html5lib=False)) + + assert links == [ + Link( + "https://example.com/files/holygrail-1.0.tar.gz", + comes_from=page.url, + requires_python=">=3.7", + yanked_reason="Had a vulnerability", + hashes={"sha256": "sha256 hash", "blake2b": "blake2b hash"}, + ), + Link( + "https://example.com/files/holygrail-1.0-py3-none-any.whl", + comes_from=page.url, + requires_python=">=3.7", + yanked_reason=None, + hashes={"sha256": "sha256 hash", "blake2b": "blake2b hash"}, + ), + ] + + @pytest.mark.parametrize( "anchor_html, expected", [