Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Commit

Permalink
Add stubs package for lxml. (#15697)
Browse files Browse the repository at this point in the history
The stubs have some issues so this has some generous cast
and ignores in it, but it is better than not having stubs.

Note that confusing that Element is a function which creates
_Element instances (and similarly for Comment).
  • Loading branch information
clokep authored May 31, 2023
1 parent 874378c commit 6f18812
Show file tree
Hide file tree
Showing 10 changed files with 117 additions and 48 deletions.
1 change: 1 addition & 0 deletions changelog.d/15697.misc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Improve type hints.
3 changes: 0 additions & 3 deletions mypy.ini
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,6 @@ ignore_missing_imports = True
[mypy-ijson.*]
ignore_missing_imports = True

[mypy-lxml]
ignore_missing_imports = True

# https://github.com/msgpack/msgpack-python/issues/448
[mypy-msgpack]
ignore_missing_imports = True
Expand Down
25 changes: 20 additions & 5 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,7 @@ black = ">=22.3.0"
ruff = "0.0.265"

# Typechecking
lxml-stubs = ">=0.4.0"
mypy = "*"
mypy-zope = "*"
types-bleach = ">=4.1.0"
Expand Down
32 changes: 19 additions & 13 deletions synapse/media/oembed.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import html
import logging
import urllib.parse
from typing import TYPE_CHECKING, List, Optional
from typing import TYPE_CHECKING, List, Optional, cast

import attr

Expand Down Expand Up @@ -98,7 +98,7 @@ def get_oembed_url(self, url: str) -> Optional[str]:
# No match.
return None

def autodiscover_from_html(self, tree: "etree.Element") -> Optional[str]:
def autodiscover_from_html(self, tree: "etree._Element") -> Optional[str]:
"""
Search an HTML document for oEmbed autodiscovery information.
Expand All @@ -109,18 +109,22 @@ def autodiscover_from_html(self, tree: "etree.Element") -> Optional[str]:
The URL to use for oEmbed information, or None if no URL was found.
"""
# Search for link elements with the proper rel and type attributes.
for tag in tree.xpath(
"//link[@rel='alternate'][@type='application/json+oembed']"
# Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this.
for tag in cast(
List["etree._Element"],
tree.xpath("//link[@rel='alternate'][@type='application/json+oembed']"),
):
if "href" in tag.attrib:
return tag.attrib["href"]
return cast(str, tag.attrib["href"])

# Some providers (e.g. Flickr) use alternative instead of alternate.
for tag in tree.xpath(
"//link[@rel='alternative'][@type='application/json+oembed']"
# Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this.
for tag in cast(
List["etree._Element"],
tree.xpath("//link[@rel='alternative'][@type='application/json+oembed']"),
):
if "href" in tag.attrib:
return tag.attrib["href"]
return cast(str, tag.attrib["href"])

return None

Expand Down Expand Up @@ -212,11 +216,12 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
return OEmbedResult(open_graph_response, author_name, cache_age)


def _fetch_urls(tree: "etree.Element", tag_name: str) -> List[str]:
def _fetch_urls(tree: "etree._Element", tag_name: str) -> List[str]:
results = []
for tag in tree.xpath("//*/" + tag_name):
# Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this.
for tag in cast(List["etree._Element"], tree.xpath("//*/" + tag_name)):
if "src" in tag.attrib:
results.append(tag.attrib["src"])
results.append(cast(str, tag.attrib["src"]))
return results


Expand Down Expand Up @@ -244,11 +249,12 @@ def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) ->
parser = etree.HTMLParser(recover=True, encoding="utf-8")

# Attempt to parse the body. If this fails, log and return no metadata.
tree = etree.fromstring(html_body, parser)
# TODO Develop of lxml-stubs has this correct.
tree = etree.fromstring(html_body, parser) # type: ignore[arg-type]

# The data was successfully parsed, but no tree was found.
if tree is None:
return
return # type: ignore[unreachable]

# Attempt to find interesting URLs (images, videos, embeds).
if "og:image" not in open_graph_response:
Expand Down
79 changes: 56 additions & 23 deletions synapse/media/preview_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
Optional,
Set,
Union,
cast,
)

if TYPE_CHECKING:
Expand Down Expand Up @@ -115,7 +116,7 @@ def _get_html_media_encodings(

def decode_body(
body: bytes, uri: str, content_type: Optional[str] = None
) -> Optional["etree.Element"]:
) -> Optional["etree._Element"]:
"""
This uses lxml to parse the HTML document.
Expand Down Expand Up @@ -152,11 +153,12 @@ def decode_body(

# Attempt to parse the body. Returns None if the body was successfully
# parsed, but no tree was found.
return etree.fromstring(body, parser)
# TODO Develop of lxml-stubs has this correct.
return etree.fromstring(body, parser) # type: ignore[arg-type]


def _get_meta_tags(
tree: "etree.Element",
tree: "etree._Element",
property: str,
prefix: str,
property_mapper: Optional[Callable[[str], Optional[str]]] = None,
Expand All @@ -175,9 +177,15 @@ def _get_meta_tags(
Returns:
A map of tag name to value.
"""
# This actually returns Dict[str, str], but the caller sets this as a variable
# which is Dict[str, Optional[str]].
results: Dict[str, Optional[str]] = {}
for tag in tree.xpath(
f"//*/meta[starts-with(@{property}, '{prefix}:')][@content][not(@content='')]"
# Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this.
for tag in cast(
List["etree._Element"],
tree.xpath(
f"//*/meta[starts-with(@{property}, '{prefix}:')][@content][not(@content='')]"
),
):
# if we've got more than 50 tags, someone is taking the piss
if len(results) >= 50:
Expand All @@ -187,14 +195,15 @@ def _get_meta_tags(
)
return {}

key = tag.attrib[property]
key = cast(str, tag.attrib[property])
if property_mapper:
key = property_mapper(key)
new_key = property_mapper(key)
# None is a special value used to ignore a value.
if key is None:
if new_key is None:
continue
key = new_key

results[key] = tag.attrib["content"]
results[key] = cast(str, tag.attrib["content"])

return results

Expand All @@ -219,7 +228,7 @@ def _map_twitter_to_open_graph(key: str) -> Optional[str]:
return "og" + key[7:]


def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
def parse_html_to_open_graph(tree: "etree._Element") -> Dict[str, Optional[str]]:
"""
Parse the HTML document into an Open Graph response.
Expand Down Expand Up @@ -276,24 +285,36 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:

if "og:title" not in og:
# Attempt to find a title from the title tag, or the biggest header on the page.
title = tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()")
# Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this.
title = cast(
List["etree._ElementUnicodeResult"],
tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()"),
)
if title:
og["og:title"] = title[0].strip()
else:
og["og:title"] = None

if "og:image" not in og:
meta_image = tree.xpath(
"//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]"
# Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this.
meta_image = cast(
List["etree._ElementUnicodeResult"],
tree.xpath(
"//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]"
),
)
# If a meta image is found, use it.
if meta_image:
og["og:image"] = meta_image[0]
else:
# Try to find images which are larger than 10px by 10px.
# Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this.
#
# TODO: consider inlined CSS styles as well as width & height attribs
images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
images = cast(
List["etree._Element"],
tree.xpath("//img[@src][number(@width)>10][number(@height)>10]"),
)
images = sorted(
images,
key=lambda i: (
Expand All @@ -302,20 +323,29 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
)
# If no images were found, try to find *any* images.
if not images:
images = tree.xpath("//img[@src][1]")
# Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this.
images = cast(List["etree._Element"], tree.xpath("//img[@src][1]"))
if images:
og["og:image"] = images[0].attrib["src"]
og["og:image"] = cast(str, images[0].attrib["src"])

# Finally, fallback to the favicon if nothing else.
else:
favicons = tree.xpath("//link[@href][contains(@rel, 'icon')]/@href[1]")
# Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this.
favicons = cast(
List["etree._ElementUnicodeResult"],
tree.xpath("//link[@href][contains(@rel, 'icon')]/@href[1]"),
)
if favicons:
og["og:image"] = favicons[0]

if "og:description" not in og:
# Check the first meta description tag for content.
meta_description = tree.xpath(
"//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]"
# Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this.
meta_description = cast(
List["etree._ElementUnicodeResult"],
tree.xpath(
"//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]"
),
)
# If a meta description is found with content, use it.
if meta_description:
Expand All @@ -332,7 +362,7 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
return og


def parse_html_description(tree: "etree.Element") -> Optional[str]:
def parse_html_description(tree: "etree._Element") -> Optional[str]:
"""
Calculate a text description based on an HTML document.
Expand Down Expand Up @@ -368,6 +398,9 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:
"canvas",
"img",
"picture",
# etree.Comment is a function which creates an etree._Comment element.
# The "tag" attribute of an etree._Comment instance is confusingly the
# etree.Comment function instead of a string.
etree.Comment,
}

Expand All @@ -381,8 +414,8 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:


def _iterate_over_text(
tree: Optional["etree.Element"],
tags_to_ignore: Set[Union[str, "etree.Comment"]],
tree: Optional["etree._Element"],
tags_to_ignore: Set[object],
stack_limit: int = 1024,
) -> Generator[str, None, None]:
"""Iterate over the tree returning text nodes in a depth first fashion,
Expand All @@ -402,7 +435,7 @@ def _iterate_over_text(

# This is a stack whose items are elements to iterate over *or* strings
# to be returned.
elements: List[Union[str, "etree.Element"]] = [tree]
elements: List[Union[str, "etree._Element"]] = [tree]
while elements:
el = elements.pop()

Expand Down
Loading

0 comments on commit 6f18812

Please sign in to comment.