Cache parse_links() by --find-links html page content (#5)
Cherry-pick of pypa#7729.
Fixes pex-tool/pex#887.
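
The core obstacle is that HTMLPage hashes by object identity, so two pages holding identical bytes would never share a cache entry under a plain functools.lru_cache. The change therefore wraps each page in a small key object that compares and hashes by (content, encoding) and memoizes on that wrapper. Below is a minimal, self-contained sketch of the pattern; Page, CacheableKey, and parse are stand-ins for illustration, not pip's real identifiers:

    from functools import lru_cache


    class Page(object):
        def __init__(self, content, encoding):
            self.content = content  # raw HTML bytes
            self.encoding = encoding


    class CacheableKey(object):
        """Hashable wrapper: equal page bytes share one cache entry."""

        def __init__(self, page):
            self.page = page

        def __eq__(self, other):
            return (isinstance(other, type(self)) and
                    self.page.content == other.page.content and
                    self.page.encoding == other.page.encoding)

        def __hash__(self):
            return hash((self.page.content, self.page.encoding))


    @lru_cache(maxsize=None)
    def _parse_cached(key):
        # The expensive parse runs once per distinct (content, encoding).
        return key.page.content.split()


    def parse(page):
        return _parse_cached(CacheableKey(page))


    p1 = Page(b'<a href="/pkg1-1.0.tar.gz"></a>', 'utf-8')
    p2 = Page(b'<a href="/pkg1-1.0.tar.gz"></a>', 'utf-8')
    assert parse(p1) is parse(p2)  # distinct objects, parsed only once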
cosmicexplorer authored Feb 14, 2020
1 parent e2553a1 commit 5eb9470
Showing 2 changed files with 128 additions and 2 deletions.
74 changes: 72 additions & 2 deletions src/pip/_internal/index/collector.py
@@ -24,8 +24,8 @@
 
 if MYPY_CHECK_RUNNING:
     from typing import (
-        Callable, Iterable, List, MutableMapping, Optional, Sequence, Tuple,
-        Union,
+        Any, Callable, Dict, Iterable, List, MutableMapping, Optional,
+        Sequence, Tuple, Union,
     )
     import xml.etree.ElementTree
@@ -41,6 +41,32 @@
 logger = logging.getLogger(__name__)
 
 
+def lru_cache(
+    *args,  # type: Any
+    **kwargs  # type: Any
+):
+    # type: (...) -> Any
+    cache = {}  # type: Dict[Any, Any]
+
+    def wrapper(fn):
+        # type: (Any) -> Any
+
+        def wrapped(
+            *args,  # type: Any
+            **kwargs  # type: Any
+        ):
+            # type: (...) -> Any
+            cache_key = tuple(args) + tuple(kwargs.items())
+            value = cache.get(cache_key, None)
+            if value is not None:
+                return value
+            value = fn(*args, **kwargs)
+            cache[cache_key] = value
+            return value
+        return wrapped
+    return wrapper
+
+
 def _match_vcs_scheme(url):
     # type: (str) -> Optional[str]
     """Look for VCS schemes in the URL.
@@ -244,6 +270,38 @@ def _create_link_from_element(
     return link
 
 
+class CacheablePageContent(object):
+    def __init__(self, page):
+        # type: (HTMLPage) -> None
+        self.page = page
+
+    def __eq__(self, other):
+        # type: (object) -> bool
+        return (isinstance(other, type(self)) and
+                self.page.content == other.page.content and
+                self.page.encoding == other.page.encoding)
+
+    def __hash__(self):
+        # type: () -> int
+        return hash((self.page.content, self.page.encoding))
+
+
+def with_cached_html_pages(fn):
+    # type: (Any) -> Any
+
+    @lru_cache(maxsize=None)
+    def wrapper(cacheable_page):
+        # type: (CacheablePageContent) -> List[Any]
+        return list(fn(cacheable_page.page))
+
+    def wrapper_wrapper(page):
+        # type: (HTMLPage) -> List[Any]
+        return wrapper(CacheablePageContent(page))
+
+    return wrapper_wrapper
+
+
+@with_cached_html_pages
 def parse_links(page):
     # type: (HTMLPage) -> Iterable[Link]
     """
@@ -308,6 +366,18 @@ def _make_html_page(response):
     return HTMLPage(response.content, encoding=encoding, url=response.url)
 
 
+def with_cached_link_fetch(fn):
+    # type: (Any) -> Any
+
+    @lru_cache(maxsize=None)
+    def wrapper(link, session=None):
+        # type: (Link, Optional[PipSession]) -> Optional[HTMLPage]
+        return fn(link, session=session)
+
+    return wrapper
+
+
+@with_cached_link_fetch
 def _get_html_page(link, session=None):
     # type: (Link, Optional[PipSession]) -> Optional[HTMLPage]
     if session is None:
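
with_cached_link_fetch memoizes _get_html_page on its full argument list, so each index page is fetched over the network at most once per (link, session) pair within a single pip invocation. A sketch of the observable effect, with plain strings standing in for Link and PipSession purely to count calls (the shim only needs its arguments to be hashable):

    fetched = []

    @with_cached_link_fetch
    def fetch(link, session=None):
        fetched.append(link)
        return 'page for %s' % link

    fetch('https://example.com/simple/', session='s1')
    fetch('https://example.com/simple/', session='s1')
    assert len(fetched) == 1  # the repeat call is served from the cache

    fetch('https://example.com/simple/', session='s2')
    assert len(fetched) == 2  # a new session is a new cache key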
56 changes: 56 additions & 0 deletions tests/unit/test_collector.py
@@ -10,6 +10,7 @@
 from pip._vendor.six.moves.urllib import request as urllib_request
 
 from pip._internal.index.collector import (
+    CacheablePageContent,
     HTMLPage,
     _clean_link,
     _determine_base_url,
@@ -277,6 +278,42 @@ def test_parse_links__yanked_reason(anchor_html, expected):
     assert actual == expected
 
 
+def test_parse_links_caches_same_page():
+    html = (
+        # Mark this as a unicode string for Python 2 since anchor_html
+        # can contain non-ascii.
+        u'<html><head><meta charset="utf-8"><head>'
+        '<body><a href="/pkg1-1.0.tar.gz"></a></body></html>'
+    )
+    html_bytes = html.encode('utf-8')
+
+    page_1 = HTMLPage(
+        html_bytes,
+        encoding=None,
+        url='https://example.com/simple/',
+    )
+    page_2 = HTMLPage(
+        html_bytes,
+        encoding=None,
+        url='https://example.com/simple/',
+    )
+
+    mock_parse = mock.patch("pip._internal.index.collector.html5lib.parse")
+    with mock_parse as mock_parse:
+        mock_parse.return_value = html5lib.parse(
+            page_1.content,
+            transport_encoding=page_1.encoding,
+            namespaceHTMLElements=False,
+        )
+        parsed_links_1 = list(parse_links(page_1))
+        mock_parse.assert_called()
+
+    with mock_parse as mock_parse:
+        parsed_links_2 = list(parse_links(page_2))
+        assert parsed_links_2 == parsed_links_1
+        mock_parse.assert_not_called()
+
+
 def test_request_http_error(caplog):
     caplog.set_level(logging.DEBUG)
     link = Link('http://localhost')
@@ -341,6 +378,25 @@ def test_get_html_page_invalid_scheme(caplog, url, vcs_scheme):
     ]
 
 
+def test_get_html_page_caches_same_link():
+    link = Link('https://example.com/link-1/')
+    session = mock.Mock(PipSession)
+
+    fake_response = make_fake_html_response(link.url)
+    mock_func = mock.patch("pip._internal.index.collector._get_html_response")
+    with mock_func as mock_func:
+        mock_func.return_value = fake_response
+        page_1 = _get_html_page(link, session=session)
+        mock_func.assert_called_once()
+
+    with mock_func as mock_func:
+        page_2 = _get_html_page(link, session=session)
+        # Assert that the result of the cached html page fetch will also then
+        # be cached by parse_links() and @with_cached_html_pages.
+        assert CacheablePageContent(page_1) == CacheablePageContent(page_2)
+        mock_func.assert_not_called()
+
+
 def make_fake_html_response(url):
     """
     Create a fake requests.Response object.
