Cache parse_links() by --find-links html page url #7729

Merged · 1 commit · Apr 8, 2020

3 changes: 3 additions & 0 deletions news/7729.feature
@@ -0,0 +1,3 @@
Cache the result of parse_links() to avoid re-tokenizing a find-links page multiple times over a pip run.

This change significantly improves resolve performance when --find-links points to a very large html page.
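
The underlying technique is plain memoization keyed by URL: wrap the page in a small hashable object whose __eq__/__hash__ look only at page.url, and put functools.lru_cache in front of the parsing function. The standalone sketch below illustrates the idea; _CacheKey, cached_by_url, parse_find_links_page and the Page namedtuple are illustrative names, not pip's API — the real implementation follows in collector.py.

import collections
import functools


class _CacheKey(object):
    """Hashable stand-in for a page; equality and hash use only the URL."""

    def __init__(self, page):
        self.page = page

    def __eq__(self, other):
        return isinstance(other, _CacheKey) and self.page.url == other.page.url

    def __hash__(self):
        return hash(self.page.url)


def cached_by_url(parse):
    """Memoize a page-parsing function, keyed by page URL."""

    @functools.lru_cache(maxsize=None)
    def _parse_cached(key):
        return list(parse(key.page))

    @functools.wraps(parse)
    def wrapper(page):
        return _parse_cached(_CacheKey(page))

    return wrapper


Page = collections.namedtuple('Page', 'url content')


@cached_by_url
def parse_find_links_page(page):
    print('tokenizing', page.url)  # stands in for the expensive parsing step
    return page.content.split()


pkgs = Page('https://example.com/find-links/', 'pkg-1.0.tar.gz pkg-2.0.tar.gz')
same_url = Page('https://example.com/find-links/', 'different bytes, same URL')
parse_find_links_page(pkgs)      # tokenizes
parse_find_links_page(same_url)  # cache hit: no second "tokenizing" line

The HTMLPage/CacheablePageContent pair in collector.py below does exactly this, with an lru_cache polyfill for Python 2 and an opt-out flag (cache_link_parsing) for index pages.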
96 changes: 86 additions & 10 deletions src/pip/_internal/index/collector.py
@@ -3,6 +3,7 @@
"""

import cgi
import functools
import itertools
import logging
import mimetypes
@@ -25,8 +26,8 @@

if MYPY_CHECK_RUNNING:
from typing import (
-Callable, Iterable, List, MutableMapping, Optional, Sequence, Tuple,
-Union,
+Callable, Iterable, List, MutableMapping, Optional,
+Protocol, Sequence, Tuple, TypeVar, Union,
)
import xml.etree.ElementTree

@@ -38,10 +39,31 @@
HTMLElement = xml.etree.ElementTree.Element
ResponseHeaders = MutableMapping[str, str]

# Used in the @lru_cache polyfill.
F = TypeVar('F')

class LruCache(Protocol):
def __call__(self, maxsize=None):
# type: (Optional[int]) -> Callable[[F], F]
raise NotImplementedError


logger = logging.getLogger(__name__)


# Fallback to noop_lru_cache in Python 2
# TODO: this can be removed when python 2 support is dropped!
def noop_lru_cache(maxsize=None):
# type: (Optional[int]) -> Callable[[F], F]
def _wrapper(f):
# type: (F) -> F
return f
return _wrapper


_lru_cache = getattr(functools, "lru_cache", noop_lru_cache) # type: LruCache


def _match_vcs_scheme(url):
# type: (str) -> Optional[str]
"""Look for VCS schemes in the URL.
@@ -285,6 +307,48 @@ def _create_link_from_element(
return link


class CacheablePageContent(object):
def __init__(self, page):
# type: (HTMLPage) -> None
assert page.cache_link_parsing
self.page = page

def __eq__(self, other):
# type: (object) -> bool
return (isinstance(other, type(self)) and
self.page.url == other.page.url)

def __hash__(self):
# type: () -> int
return hash(self.page.url)


def with_cached_html_pages(
fn, # type: Callable[[HTMLPage], Iterable[Link]]
):
# type: (...) -> Callable[[HTMLPage], List[Link]]
"""
Given a function that parses an Iterable[Link] from an HTMLPage, cache the
function's result (keyed by CacheablePageContent), unless the HTMLPage
`page` has `page.cache_link_parsing == False`.
"""

@_lru_cache(maxsize=None)
def wrapper(cacheable_page):
# type: (CacheablePageContent) -> List[Link]
return list(fn(cacheable_page.page))

@functools.wraps(fn)
def wrapper_wrapper(page):
# type: (HTMLPage) -> List[Link]
if page.cache_link_parsing:
return wrapper(CacheablePageContent(page))
return list(fn(page))

return wrapper_wrapper


@with_cached_html_pages
def parse_links(page):
# type: (HTMLPage) -> Iterable[Link]
"""
@@ -314,18 +378,23 @@ class HTMLPage(object):

def __init__(
self,
content,  # type: bytes
encoding,  # type: Optional[str]
url,  # type: str
cache_link_parsing=True,  # type: bool
):
# type: (...) -> None
"""
:param encoding: the encoding to decode the given content.
:param url: the URL from which the HTML was downloaded.
:param cache_link_parsing: whether links parsed from this page's url
should be cached. PyPI index urls should
have this set to False, for example.
"""
self.content = content
self.encoding = encoding
self.url = url
self.cache_link_parsing = cache_link_parsing

def __str__(self):
# type: () -> str
@@ -343,10 +412,14 @@ def _handle_get_page_fail(
meth("Could not fetch URL %s: %s - skipping", link, reason)


-def _make_html_page(response):
-# type: (Response) -> HTMLPage
+def _make_html_page(response, cache_link_parsing=True):
+# type: (Response, bool) -> HTMLPage
encoding = _get_encoding_from_headers(response.headers)
-return HTMLPage(response.content, encoding=encoding, url=response.url)
+return HTMLPage(
+response.content,
+encoding=encoding,
+url=response.url,
+cache_link_parsing=cache_link_parsing)


def _get_html_page(link, session=None):
@@ -399,7 +472,8 @@ def _get_html_page(link, session=None):
except requests.Timeout:
_handle_get_page_fail(link, "timed out")
else:
-return _make_html_page(resp)
+return _make_html_page(resp,
+cache_link_parsing=link.cache_link_parsing)
return None


@@ -562,7 +636,9 @@ def collect_links(self, project_name):
# We want to filter out anything that does not have a secure origin.
url_locations = [
link for link in itertools.chain(
-(Link(url) for url in index_url_loc),
+# Mark PyPI indices as "cache_link_parsing == False" -- this
+# will avoid caching the result of parsing the page for links.
+(Link(url, cache_link_parsing=False) for url in index_url_loc),
(Link(url) for url in fl_url_loc),
)
if self.session.is_secure_origin(link)
8 changes: 8 additions & 0 deletions src/pip/_internal/models/link.py
@@ -30,6 +30,7 @@ def __init__(
comes_from=None, # type: Optional[Union[str, HTMLPage]]
requires_python=None, # type: Optional[str]
yanked_reason=None, # type: Optional[Text]
cache_link_parsing=True, # type: bool
):
# type: (...) -> None
"""
@@ -46,6 +47,11 @@
a simple repository HTML link. If the file has been yanked but
no reason was provided, this should be the empty string. See
PEP 592 for more information and the specification.
:param cache_link_parsing: A flag that is used elsewhere to determine
whether resources retrieved from this link
should be cached. PyPI index urls should
generally have this set to False, for
example.
"""

# url can be a UNC windows share
@@ -63,6 +69,8 @@

super(Link, self).__init__(key=url, defining_class=Link)

self.cache_link_parsing = cache_link_parsing

def __str__(self):
# type: () -> str
if self.requires_python:
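
Across the three files, the new flag simply travels from the Link built in collect_links() to the HTMLPage produced by _make_html_page(), where parse_links() checks it before consulting the cache. A condensed sketch of that flow, using placeholder classes and a plain dict in place of the lru_cache wrapper (not the real pip objects):

class Link(object):
    def __init__(self, url, cache_link_parsing=True):
        self.url = url
        # False for index URLs (e.g. PyPI), True otherwise.
        self.cache_link_parsing = cache_link_parsing


class HTMLPage(object):
    def __init__(self, content, url, cache_link_parsing=True):
        self.content = content
        self.url = url
        self.cache_link_parsing = cache_link_parsing


def make_html_page(content, link):
    # The fetched page inherits the originating link's caching preference.
    return HTMLPage(content, url=link.url,
                    cache_link_parsing=link.cache_link_parsing)


def tokenize(page):
    # Stand-in for the expensive HTML parsing step.
    return page.content.split()


_parsed_by_url = {}


def parse_links(page):
    if not page.cache_link_parsing:
        return tokenize(page)                 # index pages: always re-parse
    if page.url not in _parsed_by_url:
        _parsed_by_url[page.url] = tokenize(page)
    return _parsed_by_url[page.url]           # find-links pages: once per run

This mirrors the comment in collect_links() above: index URLs get cache_link_parsing=False, so only --find-links pages are parsed at most once per run.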
60 changes: 57 additions & 3 deletions tests/unit/test_collector.py
@@ -1,5 +1,7 @@
import logging
import os.path
import re
import uuid
from textwrap import dedent

import mock
@@ -26,7 +28,7 @@
from pip._internal.models.index import PyPI
from pip._internal.models.link import Link
from pip._internal.network.session import PipSession
-from tests.lib import make_test_link_collector
+from tests.lib import make_test_link_collector, skip_if_python2


@pytest.mark.parametrize(
@@ -355,14 +357,61 @@ def test_parse_links__yanked_reason(anchor_html, expected):
page = HTMLPage(
html_bytes,
encoding=None,
-url='https://example.com/simple/',
+# parse_links() is cached by url, so we inject a random uuid to ensure
+# the page content isn't cached.
+url='https://example.com/simple-{}/'.format(uuid.uuid4()),
)
links = list(parse_links(page))
link, = links
actual = link.yanked_reason
assert actual == expected


@skip_if_python2
def test_parse_links_caches_same_page_by_url():
html = (
'<html><head><meta charset="utf-8"><head>'
'<body><a href="/pkg1-1.0.tar.gz"></a></body></html>'
)
html_bytes = html.encode('utf-8')

url = 'https://example.com/simple/'

page_1 = HTMLPage(
html_bytes,
encoding=None,
url=url,
)
# Make a second page with zero content, to ensure that it's not accessed,
# because the page was cached by url.
page_2 = HTMLPage(
b'',
encoding=None,
url=url,
)
# Make a third page which represents an index url, which should not be
# cached, even for the same url. We modify the page content slightly to
# verify that the result is not cached.
page_3 = HTMLPage(
re.sub(b'pkg1', b'pkg2', html_bytes),
encoding=None,
url=url,
cache_link_parsing=False,
)

parsed_links_1 = list(parse_links(page_1))
assert len(parsed_links_1) == 1
assert 'pkg1' in parsed_links_1[0].url

parsed_links_2 = list(parse_links(page_2))
assert parsed_links_2 == parsed_links_1

parsed_links_3 = list(parse_links(page_3))
assert len(parsed_links_3) == 1
assert parsed_links_3 != parsed_links_1
assert 'pkg2' in parsed_links_3[0].url


def test_request_http_error(caplog):
caplog.set_level(logging.DEBUG)
link = Link('http://localhost')
@@ -528,13 +577,14 @@ def test_fetch_page(self, mock_get_html_response):
fake_response = make_fake_html_response(url)
mock_get_html_response.return_value = fake_response

-location = Link(url)
+location = Link(url, cache_link_parsing=False)
link_collector = make_test_link_collector()
actual = link_collector.fetch_page(location)

assert actual.content == fake_response.content
assert actual.encoding is None
assert actual.url == url
assert actual.cache_link_parsing == location.cache_link_parsing

# Also check that the right session object was passed to
# _get_html_response().
@@ -559,8 +609,12 @@ def test_collect_links(self, caplog, data):

assert len(actual.find_links) == 1
check_links_include(actual.find_links, names=['packages'])
# Check that find-links URLs are marked as cacheable.
assert actual.find_links[0].cache_link_parsing

assert actual.project_urls == [Link('https://pypi.org/simple/twine/')]
# Check that index URLs are marked as *un*cacheable.
assert not actual.project_urls[0].cache_link_parsing

expected_message = dedent("""\
1 location(s) to search for versions of twine: