Skip to content

Commit

Permalink
make LinkMetadataCache
Browse files Browse the repository at this point in the history
- catch an exception when parsing metadata which only occurs in CI
- handle --no-cache-dir
- call os.makedirs() before writing to cache too
- catch InvalidSchema when attempting git urls with BatchDownloader
- fix other test failures
- reuse should_cache(req) logic
- gzip compress link metadata for a slight reduction in disk space
- only cache built sdists
- don't check should_cache() when fetching
- cache lazy wheel dists
- add news
- turn debug logs in fetching from cache into exceptions
- use scandir over listdir when searching normal wheel cache
- handle metadata email parsing errors
- correctly handle mutable cached requirement
- use bz2 over gzip for an extremely slight improvement in disk usage
  • Loading branch information
cosmicexplorer committed Jan 17, 2024
1 parent dc4582a commit 36b0fa8
Show file tree
Hide file tree
Showing 21 changed files with 391 additions and 153 deletions.
1 change: 1 addition & 0 deletions news/12256.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Cache computed metadata from sdists and lazy wheels in ``~/.cache/pip/link-metadata`` when ``--use-feature=metadata-cache`` is enabled.
118 changes: 102 additions & 16 deletions src/pip/_internal/cache.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
"""Cache Management
"""

import abc
import hashlib
import json
import logging
import os
import re
from pathlib import Path
from typing import Any, Dict, List, Optional
from typing import Dict, Iterator, List, Optional, Tuple

from pip._vendor.packaging.tags import Tag, interpreter_name, interpreter_version
from pip._vendor.packaging.utils import canonicalize_name
Expand All @@ -15,21 +17,71 @@
from pip._internal.models.direct_url import DirectUrl
from pip._internal.models.link import Link
from pip._internal.models.wheel import Wheel
from pip._internal.req.req_install import InstallRequirement
from pip._internal.utils.temp_dir import TempDirectory, tempdir_kinds
from pip._internal.utils.urls import path_to_url
from pip._internal.vcs import vcs

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Matches a ``name-version`` looking substring, case-insensitively.
# Used by _contains_egg_info() below.
_egg_info_re = re.compile(r"([a-z0-9_.]+)-([a-z0-9_.!+-]+)", re.IGNORECASE)

ORIGIN_JSON_NAME = "origin.json"


def _contains_egg_info(s: str) -> bool:
    """Determine whether the string looks like an egg_info.

    :param s: The string to parse. E.g. foo-2.1
    :return: ``True`` when ``_egg_info_re`` matches anywhere in ``s``.
    """
    return _egg_info_re.search(s) is not None


def should_cache(
    req: InstallRequirement,
) -> bool:
    """
    Return whether a built InstallRequirement can be stored in the persistent
    wheel cache, assuming the wheel cache is available, and _should_build()
    has determined a wheel needs to be built.

    :param req: The requirement being considered for caching.
    :return: ``False`` when the requirement has no link, links to a wheel, is
        editable, or has no source directory. For a VCS link, the result of
        the backend's immutable-revision check. Otherwise, whether the link's
        base name looks like ``name-version``.
    """
    link = req.link
    if not link:
        return False

    if link.is_wheel:
        return False

    if req.editable or not req.source_dir:
        # never cache editable requirements
        return False

    if link.is_vcs:
        # VCS checkout. Do not cache
        # unless it points to an immutable commit hash.
        vcs_backend = vcs.get_backend_for_scheme(link.scheme)
        assert vcs_backend
        return bool(vcs_backend.is_immutable_rev_checkout(link.url, req.source_dir))

    # Only the base name matters here; the extension is discarded.
    base, _ = link.splitext()
    return _contains_egg_info(base)


def _hash_dict(d: Dict[str, str]) -> str:
"""Return a stable sha224 of a dictionary."""
s = json.dumps(d, sort_keys=True, separators=(",", ":"), ensure_ascii=True)
return hashlib.sha224(s.encode("ascii")).hexdigest()


class Cache:
class Cache(abc.ABC):
"""An abstract class - provides cache directories for data from links
:param cache_dir: The root of the cache.
Expand Down Expand Up @@ -73,20 +125,28 @@ def _get_cache_path_parts(self, link: Link) -> List[str]:

return parts

def _get_candidates(self, link: Link, canonical_package_name: str) -> List[Any]:
can_not_cache = not self.cache_dir or not canonical_package_name or not link
if can_not_cache:
return []
@abc.abstractmethod
def get_path_for_link(self, link: Link) -> str:
"""Return a directory to store cached items in for link."""
...

def cache_path(self, link: Link) -> Path:
return Path(self.get_path_for_link(link))

path = self.get_path_for_link(link)
if os.path.isdir(path):
return [(candidate, path) for candidate in os.listdir(path)]
return []

class LinkMetadataCache(Cache):
"""Persistently store the metadata of dists found at each link."""

def get_path_for_link(self, link: Link) -> str:
"""Return a directory to store cached items in for link."""
raise NotImplementedError()
parts = self._get_cache_path_parts(link)
assert self.cache_dir
return os.path.join(self.cache_dir, "link-metadata", *parts)


class WheelCacheBase(Cache):
"""Specializations to the cache concept for wheels."""

@abc.abstractmethod
def get(
self,
link: Link,
Expand All @@ -96,10 +156,27 @@ def get(
"""Returns a link to a cached item if it exists, otherwise returns the
passed link.
"""
raise NotImplementedError()
...

def _can_cache(self, link: Link, canonical_package_name: str) -> bool:
return bool(self.cache_dir and canonical_package_name and link)

def _get_candidates(
self, link: Link, canonical_package_name: str
) -> Iterator[Tuple[str, str]]:
if not self._can_cache(link, canonical_package_name):
return

path = self.get_path_for_link(link)
if not os.path.isdir(path):
return

class SimpleWheelCache(Cache):
for candidate in os.scandir(path):
if candidate.is_file():
yield (candidate.name, path)


class SimpleWheelCache(WheelCacheBase):
"""A cache of wheels for future installs."""

def __init__(self, cache_dir: str) -> None:
Expand Down Expand Up @@ -131,7 +208,7 @@ def get(
package_name: Optional[str],
supported_tags: List[Tag],
) -> Link:
candidates = []
candidates: List[Tuple[int, str, str]] = []

if not package_name:
return link
Expand Down Expand Up @@ -205,7 +282,7 @@ def __init__(
)


class WheelCache(Cache):
class WheelCache(WheelCacheBase):
"""Wraps EphemWheelCache and SimpleWheelCache into a single Cache
This Cache allows for gracefully degradation, using the ephem wheel cache
Expand All @@ -223,6 +300,15 @@ def get_path_for_link(self, link: Link) -> str:
def get_ephem_path_for_link(self, link: Link) -> str:
return self._ephem_cache.get_path_for_link(link)

def resolve_cache_dir(self, req: InstallRequirement) -> str:
    """Return the persistent or temporary cache directory where the built or
    downloaded wheel should be stored.

    :param req: The requirement to store; it must have a link.
    :return: The persistent path when a cache dir is set and
        ``should_cache(req)`` holds, otherwise the ephemeral path.
    """
    assert req.link, req
    if self.cache_dir and should_cache(req):
        return self.get_path_for_link(req.link)
    return self.get_ephem_path_for_link(req.link)

def get(
self,
link: Link,
Expand Down
1 change: 1 addition & 0 deletions src/pip/_internal/cli/cmdoptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1008,6 +1008,7 @@ def check_list_path_option(options: Values) -> None:
default=[],
choices=[
"fast-deps",
"metadata-cache",
"truststore",
]
+ ALWAYS_ENABLED_FEATURES,
Expand Down
13 changes: 12 additions & 1 deletion src/pip/_internal/cli/req_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from optparse import Values
from typing import TYPE_CHECKING, Any, List, Optional, Tuple

from pip._internal.cache import WheelCache
from pip._internal.cache import LinkMetadataCache, WheelCache
from pip._internal.cli import cmdoptions
from pip._internal.cli.base_command import Command
from pip._internal.cli.command_context import CommandContextMixIn
Expand Down Expand Up @@ -305,6 +305,16 @@ def make_requirement_preparer(
"fast-deps has no effect when used with the legacy resolver."
)

if options.cache_dir and "metadata-cache" in options.features_enabled:
logger.warning(
"pip is using a local cache for metadata information. "
"This experimental feature is enabled through "
"--use-feature=metadata-cache and it is not ready for "
"production."
)
metadata_cache = LinkMetadataCache(options.cache_dir)
else:
metadata_cache = None
return RequirementPreparer(
build_dir=temp_build_dir_path,
src_dir=options.src_dir,
Expand All @@ -320,6 +330,7 @@ def make_requirement_preparer(
lazy_wheel=lazy_wheel,
verbosity=verbosity,
legacy_resolver=legacy_resolver,
metadata_cache=metadata_cache,
)

@classmethod
Expand Down
19 changes: 19 additions & 0 deletions src/pip/_internal/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,25 @@ def __str__(self) -> str:
return f"None {self.metadata_name} metadata found for distribution: {self.dist}"


class CacheMetadataError(PipError):
    """Raised when de/serializing a requirement into the metadata cache."""

    def __init__(self, req: "InstallRequirement", reason: str) -> None:
        """
        :param req: The requirement we attempted to cache.
        :param reason: Context about the precise error that occurred.
        """
        self.reason = reason
        self.req = req

    def __str__(self) -> str:
        # Reads as "<reason> for <requirement> from <link>".
        return f"{self.reason} for {self.req} from {self.req.link}"


class UserInstallationInvalid(InstallationError):
"""A --user install is requested on an environment without user site."""

Expand Down
10 changes: 9 additions & 1 deletion src/pip/_internal/metadata/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,14 @@

from pip._internal.utils.misc import strtobool

from .base import BaseDistribution, BaseEnvironment, FilesystemWheel, MemoryWheel, Wheel
from .base import (
BaseDistribution,
BaseEnvironment,
FilesystemWheel,
MemoryWheel,
Wheel,
serialize_metadata,
)

if TYPE_CHECKING:
from typing import Literal, Protocol
Expand All @@ -23,6 +30,7 @@
"get_environment",
"get_wheel_distribution",
"select_backend",
"serialize_metadata",
]


Expand Down
15 changes: 15 additions & 0 deletions src/pip/_internal/metadata/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import csv
import email.generator
import email.message
import email.policy
import functools
import io
import json
import logging
import pathlib
Expand Down Expand Up @@ -97,6 +100,18 @@ def _convert_installed_files_path(
return str(pathlib.Path(*info, *entry))


def serialize_metadata(msg: email.message.Message) -> str:
    """Write a dist's metadata to a string.

    Calling ``str(dist.metadata)`` may raise an error by misinterpreting RST directives
    as email headers. This method uses the more robust ``email.policy.EmailPolicy`` to
    avoid those parsing errors.

    :param msg: The metadata message to serialize.
    :return: The flattened text of ``msg``.
    """
    policy = email.policy.EmailPolicy()
    with io.StringIO() as buffer:
        email.generator.Generator(buffer, policy=policy).flatten(msg)
        return buffer.getvalue()


class RequiresEntry(NamedTuple):
requirement: str
extra: str
Expand Down
2 changes: 1 addition & 1 deletion src/pip/_internal/network/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def _get_http_response_filename(resp: Response, link: Link) -> str:


def _http_get_download(session: PipSession, link: Link) -> Response:
target_url = link.url.split("#", 1)[0]
target_url = link.url_without_fragment
resp = session.get(target_url, headers=HEADERS, stream=True)
raise_for_status(resp)
return resp
Expand Down
Loading

0 comments on commit 36b0fa8

Please sign in to comment.