Skip to content

Commit

Permalink
Add PEP 658 support.
Browse files Browse the repository at this point in the history
move url cleaning to link.py

use a nice dataclass to decouple hash parsing from Link

avoid downloading wheels when testing the resolver in isolation

avoid special-casing the python version requirement in download.py

streamline the RequirementSetWithCandidates invocation

restore _clean_link method from collector.py to pass tests
  • Loading branch information
cosmicexplorer committed Dec 28, 2021
1 parent d4ccc39 commit 8c755e8
Show file tree
Hide file tree
Showing 12 changed files with 400 additions and 174 deletions.
123 changes: 112 additions & 11 deletions src/pip/_internal/commands/download.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,99 @@
import json
import logging
import os
from dataclasses import dataclass, field
from optparse import Values
from typing import Dict, List
from typing import Any, Dict, List, Optional, Union

from pip._vendor.packaging.requirements import Requirement

from pip._internal.cli import cmdoptions
from pip._internal.cli.cmdoptions import make_target_python
from pip._internal.cli.req_command import RequirementCommand, with_cleanup
from pip._internal.cli.status_codes import SUCCESS
from pip._internal.models.link import Link, LinkHash
from pip._internal.req.req_tracker import get_requirement_tracker
from pip._internal.resolution.base import RequirementSetWithCandidates
from pip._internal.utils.misc import ensure_dir, normalize_path, write_output
from pip._internal.utils.temp_dir import TempDirectory

logger = logging.getLogger(__name__)


@dataclass(frozen=True)
class DistInfoMetadata:
    """Information about a PEP 658 metadata file attached to a file link.

    PEP 658 lets a simple-repository index serve a distribution's metadata
    file next to the distribution itself, advertised via the
    ``data-dist-info-metadata`` anchor attribute. The attribute value is
    either the literal string "true" (the file exists, but no hash was
    given) or a "<hash-name>=<hash-value>" string.
    """

    # URL of the metadata file: the link's URL with ".metadata" appended.
    metadata_url: str
    # Hash advertised for the metadata file, or None if the index only
    # declared that the file exists (attribute value "true").
    metadata_hash: Optional[LinkHash]

    @classmethod
    def from_link(cls, link: Link) -> Optional["DistInfoMetadata"]:
        """Extract PEP 658 info from ``link``, or None if it carries none."""
        if link.dist_info_metadata is None:
            return None

        metadata_url = f"{link.url_without_fragment}.metadata"
        if link.dist_info_metadata == "true":
            # The index asserted the metadata file exists without hashing it.
            metadata_hash = None
        else:
            metadata_hash = LinkHash.split_hash_name_and_value(link.dist_info_metadata)

        return cls(metadata_url=metadata_url, metadata_hash=metadata_hash)

    def as_json(self) -> Dict[str, Union[str, Optional[Dict[str, str]]]]:
        """Return a JSON-serializable representation of this instance."""
        return {
            "metadata_url": self.metadata_url,
            "metadata_hash": (
                self.metadata_hash.as_json() if self.metadata_hash else None
            ),
        }


@dataclass(frozen=True)
class RequirementDownloadInfo:
    """Serializable record of where and how one resolved requirement was fetched."""

    # The requirement that was resolved.
    req: Requirement
    # Exact URL the distribution file comes from.
    url: str
    # Hash of the downloaded file, if the index advertised one.
    file_hash: Optional[LinkHash]
    # PEP 658 metadata-file info for the link, if any.
    dist_info_metadata: Optional[DistInfoMetadata]

    @classmethod
    def from_req_and_link(
        cls,
        req: Requirement,
        link: Link,
    ) -> "RequirementDownloadInfo":
        """Combine a requirement with the link it was resolved to."""
        return cls(
            req=req,
            url=link.url,
            file_hash=link.get_link_hash(),
            dist_info_metadata=DistInfoMetadata.from_link(link),
        )

    def as_json(self) -> Dict[str, Any]:
        """Return a JSON-serializable representation of this instance."""
        file_hash_json = self.file_hash.as_json() if self.file_hash else None
        metadata_json = (
            self.dist_info_metadata.as_json() if self.dist_info_metadata else None
        )
        return {
            "req": str(self.req),
            "url": self.url,
            "hash": file_hash_json,
            "dist_info_metadata": metadata_json,
        }


@dataclass
class DownloadInfos:
    """Everything recorded for the JSON download report."""

    # Requirements pulled in by the resolver that do not appear in the
    # requirement set by name (e.g. the python version requirement).
    implicit_requirements: List[Requirement] = field(default_factory=list)
    # Per-requirement download info, keyed by requirement name.
    resolution: Dict[str, RequirementDownloadInfo] = field(default_factory=dict)

    def as_json(self) -> Dict[str, Any]:
        """Return a JSON-serializable representation of this instance."""
        serialized_resolution = {}
        for name, info in self.resolution.items():
            serialized_resolution[name] = info.as_json()
        return {
            "implicit_requirements": [str(req) for req in self.implicit_requirements],
            "resolution": serialized_resolution,
        }


class DownloadCommand(RequirementCommand):
"""
Download packages from:
Expand Down Expand Up @@ -149,24 +228,46 @@ def run(self, options: Values, args: List[str]) -> int:
requirement_set = resolver.resolve(reqs, check_supported_wheels=True)

downloaded: List[str] = []
download_infos: List[Dict[str, str]] = []
for req in requirement_set.requirements.values():
# If this distribution was not already satisfied, that means we
# downloaded it.
if req.satisfied_by is None:
assert req.name is not None
assert req.link is not None
download_infos.append(
{
"name": req.name,
"url": req.link.url,
}
)
preparer.save_linked_requirement(req)
assert req.name is not None
downloaded.append(req.name)

download_infos = DownloadInfos()
if options.print_download_urls:
if isinstance(requirement_set, RequirementSetWithCandidates):
for candidate in requirement_set.candidates.mapping.values():
# This will occur for the python version requirement, for example.
if candidate.name not in requirement_set.requirements:
download_infos.implicit_requirements.append(
candidate.as_serializable_requirement()
)
continue
req = requirement_set.requirements[candidate.name]
assert req.name is not None
assert req.link is not None
assert req.name not in download_infos.resolution
download_infos.resolution[
req.name
] = RequirementDownloadInfo.from_req_and_link(
req=candidate.as_serializable_requirement(),
link=req.link,
)
else:
logger.warning(
"--print-download-urls is being used with the legacy resolver. "
"The legacy resolver does not retain detailed dependency "
"information, so all the fields in the output JSON file "
"will be empty."
)

if downloaded:
write_output("Successfully downloaded %s", " ".join(downloaded))
if options.print_download_urls:
with open(options.print_download_urls, "w") as f:
json.dump(download_infos, f, indent=4)
json.dump(download_infos.as_json(), f, indent=4)

return SUCCESS
101 changes: 3 additions & 98 deletions src/pip/_internal/index/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,8 @@
import itertools
import logging
import os
import re
import urllib.parse
import urllib.request
import xml.etree.ElementTree
from optparse import Values
from typing import (
Callable,
Expand All @@ -29,19 +27,18 @@
from pip._vendor.requests.exceptions import RetryError, SSLError

from pip._internal.exceptions import NetworkConnectionError
from pip._internal.models.link import Link
from pip._internal.models.link import HTMLElement, Link
from pip._internal.models.search_scope import SearchScope
from pip._internal.network.session import PipSession
from pip._internal.network.utils import raise_for_status
from pip._internal.utils.filetypes import is_archive_file
from pip._internal.utils.misc import pairwise, redact_auth_from_url
from pip._internal.utils.misc import redact_auth_from_url
from pip._internal.vcs import vcs

from .sources import CandidatesFromPage, LinkSource, build_source

logger = logging.getLogger(__name__)

HTMLElement = xml.etree.ElementTree.Element
ResponseHeaders = MutableMapping[str, str]


Expand Down Expand Up @@ -171,94 +168,6 @@ def _determine_base_url(document: HTMLElement, page_url: str) -> str:
return page_url


def _clean_url_path_part(part: str) -> str:
"""
Clean a "part" of a URL path (i.e. after splitting on "@" characters).
"""
# We unquote prior to quoting to make sure nothing is double quoted.
return urllib.parse.quote(urllib.parse.unquote(part))


def _clean_file_url_path(part: str) -> str:
"""
Clean the first part of a URL path that corresponds to a local
filesystem path (i.e. the first part after splitting on "@" characters).
"""
# We unquote prior to quoting to make sure nothing is double quoted.
# Also, on Windows the path part might contain a drive letter which
# should not be quoted. On Linux where drive letters do not
# exist, the colon should be quoted. We rely on urllib.request
# to do the right thing here.
return urllib.request.pathname2url(urllib.request.url2pathname(part))


# Reserved markers kept verbatim while cleaning: VCS revision separators
# ("@") and percent-encoded path separators ("%2F"), case-insensitively.
_reserved_chars_re = re.compile("(@|%2F)", re.IGNORECASE)


def _clean_url_path(path: str, is_local_path: bool) -> str:
    """Re-quote every non-reserved segment of a URL path.

    Splitting on the reserved characters before cleaning preserves
    revision strings in VCS URLs.
    """
    clean_func = _clean_file_url_path if is_local_path else _clean_url_path_part

    # re.split() with a capturing group alternates plain text with the
    # reserved separators it matched, starting and ending with plain text.
    pieces = _reserved_chars_re.split(path)

    cleaned = []
    for index, piece in enumerate(pieces):
        if index % 2:
            # A reserved separator: normalize %xx escapes (e.g. %2f -> %2F).
            cleaned.append(piece.upper())
        else:
            cleaned.append(clean_func(piece))

    return "".join(cleaned)


def _clean_link(url: str) -> str:
    """Return ``url`` with its path portion fully percent-quoted.

    For example, a ' ' in the URL becomes "%20", without double-quoting
    characters that are already escaped.
    """
    # Parse into `scheme://netloc/path;parameters?query#fragment` parts.
    parsed = urllib.parse.urlparse(url)
    # An empty netloc means the URL names a local filesystem path.
    cleaned_path = _clean_url_path(parsed.path, is_local_path=not parsed.netloc)
    return urllib.parse.urlunparse(parsed._replace(path=cleaned_path))


def _create_link_from_element(
    anchor: HTMLElement,
    page_url: str,
    base_url: str,
) -> Optional[Link]:
    """Build a Link from an <a> element on a simple-repository page.

    Returns None when the anchor has no href and so cannot point at a
    distribution file.
    """
    href = anchor.get("href")
    if not href:
        return None

    resolved_url = _clean_link(urllib.parse.urljoin(base_url, href))
    return Link(
        resolved_url,
        comes_from=page_url,
        requires_python=anchor.get("data-requires-python"),
        yanked_reason=anchor.get("data-yanked"),
    )


class CacheablePageContent:
def __init__(self, page: "HTMLPage") -> None:
assert page.cache_link_parsing
Expand Down Expand Up @@ -307,11 +216,7 @@ def parse_links(page: "HTMLPage") -> Iterable[Link]:
url = page.url
base_url = _determine_base_url(document, url)
for anchor in document.findall(".//a"):
link = _create_link_from_element(
anchor,
page_url=url,
base_url=base_url,
)
link = Link.from_element(anchor, page_url=url, base_url=base_url)
if link is None:
continue
yield link
Expand Down
3 changes: 3 additions & 0 deletions src/pip/_internal/metadata/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,9 @@ def __repr__(self) -> str:
def __str__(self) -> str:
    """Render the distribution as "<raw name> <version>"."""
    return f"{self.raw_name} {self.version}"

def as_serializable_requirement(self) -> Requirement:
    """Return this distribution as a packaging ``Requirement``.

    Abstract here; concrete metadata backends override it (the
    pkg_resources backend delegates to its wrapped distribution).
    """
    raise NotImplementedError()

@property
def location(self) -> Optional[str]:
"""Where the distribution is loaded from.
Expand Down
3 changes: 3 additions & 0 deletions src/pip/_internal/metadata/pkg_resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,9 @@ def from_wheel(cls, wheel: Wheel, name: str) -> "Distribution":
)
return cls(dist)

def as_serializable_requirement(self) -> Requirement:
    """Return this distribution as a packaging ``Requirement``.

    Delegates to the wrapped distribution's ``as_requirement()``.
    """
    return self._dist.as_requirement()

@property
def location(self) -> Optional[str]:
    # Location as reported by the wrapped distribution object.
    return self._dist.location
Expand Down
Loading

0 comments on commit 8c755e8

Please sign in to comment.