Skip to content

Commit

Permalink
link_sources: speed up resolution for legacy repositories
Browse files Browse the repository at this point in the history
This commit changes the data structure used to hold links parsed from a legacy repository page so that they can be looked up by package name and version. This avoids iterating through the lists of links over and over again — usually the same list several times, because repositories such as PyPI and AWS CodeArtifact list the links for all published versions of a package on a single page.
  • Loading branch information
MasterNayru committed Sep 7, 2022
1 parent 28d0175 commit 1f27951
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 30 deletions.
39 changes: 24 additions & 15 deletions src/poetry/repositories/link_sources/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,24 +48,33 @@ def versions(self, name: str) -> Iterator[Version]:
name = canonicalize_name(name)
seen: set[Version] = set()

for link in self.links:
pkg = self.link_package_data(link)
if not self.links or name not in self.links:
return []

if pkg and pkg.name == name and pkg.version not in seen:
seen.add(pkg.version)
yield pkg.version
for version, links in self.links[name].items():
for link in links:
if not link:
continue

pkg = self.link_package_data(link)

if pkg and pkg.version not in seen:
seen.add(pkg.version)
yield pkg.version

@property
def packages(self) -> Iterator[Package]:
for link in self.links:
pkg = self.link_package_data(link)
for pkg_name, versions in self.links.items():
for version, links in versions.items():
for link in links:
pkg = self.link_package_data(link)

if pkg:
yield pkg
if pkg:
yield pkg

@property
@abstractmethod
def links(self) -> Iterator[Link]:
def links(self) -> Dict[Link]:
raise NotImplementedError()

@classmethod
Expand Down Expand Up @@ -101,12 +110,12 @@ def link_package_data(cls, link: Link) -> Package | None:

def links_for_version(
self, name: NormalizedName, version: Version
) -> Iterator[Link]:
for link in self.links:
pkg = self.link_package_data(link)
) -> [Link]:
version = str(version)
if name not in self.links or version not in self.links[name]:
return []

if pkg and pkg.name == name and pkg.version == version:
yield link
return self.links[name][version]

def clean_link(self, url: str) -> str:
"""Makes sure a link is fully encoded. That is, if a ' ' shows up in
Expand Down
29 changes: 25 additions & 4 deletions src/poetry/repositories/link_sources/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,12 @@
import warnings

from html import unescape
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Dict

from poetry.core.packages.utils.link import Link

from poetry.repositories.link_sources.base import LinkSource


if TYPE_CHECKING:
from collections.abc import Iterator

Expand All @@ -24,9 +23,15 @@ def __init__(self, url: str, content: str) -> None:
super().__init__(url=url)

self._parsed = html5lib.parse(content, namespaceHTMLElements=False)
self.links_cache = None

@property
def links(self) -> Iterator[Link]:
def links(self) -> Dict[Link]:
if self.links_cache:
return self.links_cache

links = {}

for anchor in self._parsed.findall(".//a"):
if anchor.get("href"):
href = anchor.get("href")
Expand All @@ -44,8 +49,24 @@ def links(self) -> Iterator[Link]:
if link.ext not in self.SUPPORTED_FORMATS:
continue

yield link
pkg = self.link_package_data(link)

if not hasattr(pkg, "name"):
continue

if pkg.name not in links:
links[pkg.name] = {}

if not hasattr(pkg, "version"):
continue

if str(pkg.version) not in links[pkg.name]:
links[pkg.name][str(pkg.version)] = []

links[pkg.name][str(pkg.version)].append(link)

self.links_cache = links
return links

class SimpleRepositoryPage(HTMLPage):
def __init__(self, url: str, content: str) -> None:
Expand Down
20 changes: 11 additions & 9 deletions tests/repositories/link_sources/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,17 @@ def link_source(mocker: MockerFixture) -> LinkSource:
mocker.patch(
f"{LinkSource.__module__}.{LinkSource.__qualname__}.links",
new_callable=PropertyMock,
return_value=iter(
[
Link(f"{url}/demo-0.1.0.tar.gz"),
Link(f"{url}/demo-0.1.0_invalid.tar.gz"),
Link(f"{url}/invalid.tar.gz"),
Link(f"{url}/demo-0.1.0-py2.py3-none-any.whl"),
Link(f"{url}/demo-0.1.1.tar.gz"),
]
),
return_value={
"demo": {
"0.1.0": [
Link(f"{url}/demo-0.1.0.tar.gz"),
Link(f"{url}/demo-0.1.0-py2.py3-none-any.whl"),
],
"0.1.1": [
Link(f"{url}/demo-0.1.1.tar.gz"),
]
}
}
)
return link_source

Expand Down
6 changes: 4 additions & 2 deletions tests/repositories/link_sources/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,16 @@
],
)
def test_link_attributes(attributes: str, expected_link: Link) -> None:
pkg_name = "demo"
pkg_version = "0.1"
anchor = (
f'<a href="https://example.org/demo-0.1.whl" {attributes}>demo-0.1.whl</a><br/>'
)
content = DEMO_TEMPLATE.format(anchor)
page = HTMLPage("https://example.org", content)

assert len(list(page.links)) == 1
link = list(page.links)[0]
assert len(list(page.links[pkg_name][pkg_version])) == 1
link = list(page.links[pkg_name][pkg_version])[0]
assert link.url == expected_link.url
assert link.requires_python == expected_link.requires_python
assert link.yanked == expected_link.yanked
Expand Down

0 comments on commit 1f27951

Please sign in to comment.