Cache parse_links() by --find-links html page content (#5)
Cherry-pick of pypa#7729.
Fixes pex-tool/pex#887.
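
The core obstacle is that HTMLPage hashes by object identity, so two pages holding identical bytes would never share a cache entry under a plain functools.lru_cache. The change therefore wraps each page in a small key object that compares and hashes by (content, encoding) and memoizes on that wrapper. Below is a minimal, self-contained sketch of the pattern; Page, CacheableKey, and parse are stand-ins for illustration, not pip's real identifiers:

    from functools import lru_cache


    class Page(object):
        def __init__(self, content, encoding):
            self.content = content  # raw HTML bytes
            self.encoding = encoding


    class CacheableKey(object):
        """Hashable wrapper: equal page bytes share one cache entry."""

        def __init__(self, page):
            self.page = page

        def __eq__(self, other):
            return (isinstance(other, type(self)) and
                    self.page.content == other.page.content and
                    self.page.encoding == other.page.encoding)

        def __hash__(self):
            return hash((self.page.content, self.page.encoding))


    @lru_cache(maxsize=None)
    def _parse_cached(key):
        # The expensive parse runs once per distinct (content, encoding).
        return key.page.content.split()


    def parse(page):
        return _parse_cached(CacheableKey(page))


    p1 = Page(b'<a href="/pkg1-1.0.tar.gz"></a>', 'utf-8')
    p2 = Page(b'<a href="/pkg1-1.0.tar.gz"></a>', 'utf-8')
    assert parse(p1) is parse(p2)  # distinct objects, parsed only once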
cosmicexplorer authored Feb 14, 2020
1 parent e2553a1 commit 5eb9470
Showing 2 changed files with 128 additions and 2 deletions.
74 changes: 72 additions & 2 deletions src/pip/_internal/index/collector.py
@@ -24,8 +24,8 @@
 
 if MYPY_CHECK_RUNNING:
     from typing import (
-        Callable, Iterable, List, MutableMapping, Optional, Sequence, Tuple,
-        Union,
+        Any, Callable, Dict, Iterable, List, MutableMapping, Optional,
+        Sequence, Tuple, Union,
     )
     import xml.etree.ElementTree
@@ -41,6 +41,32 @@
 logger = logging.getLogger(__name__)
 
 
+def lru_cache(
+    *args,  # type: Any
+    **kwargs  # type: Any
+):
+    # type: (...) -> Any
+    cache = {}  # type: Dict[Any, Any]
+
+    def wrapper(fn):
+        # type: (Any) -> Any
+
+        def wrapped(
+            *args,  # type: Any
+            **kwargs  # type: Any
+        ):
+            # type: (...) -> Any
+            cache_key = tuple(args) + tuple(kwargs.items())
+            value = cache.get(cache_key, None)
+            if value is not None:
+                return value
+            value = fn(*args, **kwargs)
+            cache[cache_key] = value
+            return value
+        return wrapped
+    return wrapper
+
+
 def _match_vcs_scheme(url):
     # type: (str) -> Optional[str]
     """Look for VCS schemes in the URL.
@@ -244,6 +270,38 @@ def _create_link_from_element(
     return link
 
 
+class CacheablePageContent(object):
+    def __init__(self, page):
+        # type: (HTMLPage) -> None
+        self.page = page
+
+    def __eq__(self, other):
+        # type: (object) -> bool
+        return (isinstance(other, type(self)) and
+                self.page.content == other.page.content and
+                self.page.encoding == other.page.encoding)
+
+    def __hash__(self):
+        # type: () -> int
+        return hash((self.page.content, self.page.encoding))
+
+
+def with_cached_html_pages(fn):
+    # type: (Any) -> Any
+
+    @lru_cache(maxsize=None)
+    def wrapper(cacheable_page):
+        # type: (CacheablePageContent) -> List[Any]
+        return list(fn(cacheable_page.page))
+
+    def wrapper_wrapper(page):
+        # type: (HTMLPage) -> List[Any]
+        return wrapper(CacheablePageContent(page))
+
+    return wrapper_wrapper
+
+
+@with_cached_html_pages
 def parse_links(page):
     # type: (HTMLPage) -> Iterable[Link]
     """
@@ -308,6 +366,18 @@ def _make_html_page(response):
     return HTMLPage(response.content, encoding=encoding, url=response.url)
 
 
+def with_cached_link_fetch(fn):
+    # type: (Any) -> Any
+
+    @lru_cache(maxsize=None)
+    def wrapper(link, session=None):
+        # type: (Link, Optional[PipSession]) -> Optional[HTMLPage]
+        return fn(link, session=session)
+
+    return wrapper
+
+
+@with_cached_link_fetch
 def _get_html_page(link, session=None):
     # type: (Link, Optional[PipSession]) -> Optional[HTMLPage]
     if session is None:
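
with_cached_link_fetch memoizes _get_html_page on its full argument list, so each index page is fetched over the network at most once per (link, session) pair within a single pip invocation. A sketch of the observable effect, with plain strings standing in for Link and PipSession purely to count calls (the shim only needs its arguments to be hashable):

    fetched = []

    @with_cached_link_fetch
    def fetch(link, session=None):
        fetched.append(link)
        return 'page for %s' % link

    fetch('https://example.com/simple/', session='s1')
    fetch('https://example.com/simple/', session='s1')
    assert len(fetched) == 1  # the repeat call is served from the cache

    fetch('https://example.com/simple/', session='s2')
    assert len(fetched) == 2  # a new session is a new cache key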
56 changes: 56 additions & 0 deletions tests/unit/test_collector.py
@@ -10,6 +10,7 @@
 from pip._vendor.six.moves.urllib import request as urllib_request
 
 from pip._internal.index.collector import (
+    CacheablePageContent,
     HTMLPage,
     _clean_link,
     _determine_base_url,
@@ -277,6 +278,42 @@ def test_parse_links__yanked_reason(anchor_html, expected):
     assert actual == expected
 
 
+def test_parse_links_caches_same_page():
+    html = (
+        # Mark this as a unicode string for Python 2 since anchor_html
+        # can contain non-ascii.
+        u'<html><head><meta charset="utf-8"><head>'
+        '<body><a href="/pkg1-1.0.tar.gz"></a></body></html>'
+    )
+    html_bytes = html.encode('utf-8')
+
+    page_1 = HTMLPage(
+        html_bytes,
+        encoding=None,
+        url='https://example.com/simple/',
+    )
+    page_2 = HTMLPage(
+        html_bytes,
+        encoding=None,
+        url='https://example.com/simple/',
+    )
+
+    mock_parse = mock.patch("pip._internal.index.collector.html5lib.parse")
+    with mock_parse as mock_parse:
+        mock_parse.return_value = html5lib.parse(
+            page_1.content,
+            transport_encoding=page_1.encoding,
+            namespaceHTMLElements=False,
+        )
+        parsed_links_1 = list(parse_links(page_1))
+        mock_parse.assert_called()
+
+    with mock_parse as mock_parse:
+        parsed_links_2 = list(parse_links(page_2))
+        assert parsed_links_2 == parsed_links_1
+        mock_parse.assert_not_called()
+
+
 def test_request_http_error(caplog):
     caplog.set_level(logging.DEBUG)
     link = Link('http://localhost')
@@ -341,6 +378,25 @@ def test_get_html_page_invalid_scheme(caplog, url, vcs_scheme):
     ]
 
 
+def test_get_html_page_caches_same_link():
+    link = Link('https://example.com/link-1/')
+    session = mock.Mock(PipSession)
+
+    fake_response = make_fake_html_response(link.url)
+    mock_func = mock.patch("pip._internal.index.collector._get_html_response")
+    with mock_func as mock_func:
+        mock_func.return_value = fake_response
+        page_1 = _get_html_page(link, session=session)
+        mock_func.assert_called_once()
+
+    with mock_func as mock_func:
+        page_2 = _get_html_page(link, session=session)
+        # Assert that the result of the cached html page fetch will also then
+        # be cached by parse_links() and @with_cached_html_pages.
+        assert CacheablePageContent(page_1) == CacheablePageContent(page_2)
+        mock_func.assert_not_called()
+
+
 def make_fake_html_response(url):
     """
     Create a fake requests.Response object.
