Skip to content

Commit 588286b

Browse files
authored
fix: handling of rdf:HTML literals (#2490)
Previously, without `html5lib` installed, literals with the `rdf:HTML` datatype were treated as [ill-typed](https://www.w3.org/TR/rdf11-concepts/#section-Graph-Literal), even if they were not ill-typed. With this change, if `html5lib` is not installed, literals with the `rdf:HTML` datatype will not be treated as ill-typed, and will have `None` as their `ill_typed` attribute value, which means that it is unknown whether they are ill-typed or not. This change also fixes the mapping from `rdf:HTML` literal values to lexical forms. Other changes: - Add tests for `rdflib.NORMALIZE_LITERALS` to ensure it behaves correctly. Related issues: - Fixes <#2475>
1 parent 6981c28 commit 588286b

File tree

7 files changed

+281
-36
lines changed

7 files changed

+281
-36
lines changed

rdflib/term.py

Lines changed: 51 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,15 @@
7676
from .namespace import NamespaceManager
7777
from .paths import AlternativePath, InvPath, NegatedPath, Path, SequencePath
7878

79+
_HAS_HTML5LIB = False
80+
81+
try:
82+
import html5lib
83+
84+
_HAS_HTML5LIB = True
85+
except ImportError:
86+
html5lib = None
87+
7988
_SKOLEM_DEFAULT_AUTHORITY = "https://rdflib.github.io"
8089

8190
logger = logging.getLogger(__name__)
@@ -1638,20 +1647,34 @@ def _parseXML(xmlstring: str) -> xml.dom.minidom.Document: # noqa: N802
16381647
return retval
16391648

16401649

1641-
def _parseHTML(htmltext: str) -> xml.dom.minidom.DocumentFragment: # noqa: N802
1642-
try:
1643-
import html5lib
1644-
except ImportError:
1645-
raise ImportError(
1646-
"HTML5 parser not available. Try installing"
1647-
+ " html5lib <http://code.google.com/p/html5lib>"
1648-
)
1650+
def _parse_html(lexical_form: str) -> xml.dom.minidom.DocumentFragment:
1651+
"""
1652+
Parse the lexical form of an HTML literal into a document fragment
1653+
using the ``dom`` tree builder from html5lib.
1654+
1655+
:param lexical_form: The lexical form of the HTML literal.
1656+
:return: A document fragment representing the HTML literal.
1657+
:raises: `html5lib.html5parser.ParseError` if the lexical form is
1658+
not valid HTML.
1659+
"""
16491660
parser = html5lib.HTMLParser(
16501661
tree=html5lib.treebuilders.getTreeBuilder("dom"), strict=True
16511662
)
1652-
retval = parser.parseFragment(htmltext)
1653-
retval.normalize()
1654-
return retval
1663+
result: xml.dom.minidom.DocumentFragment = parser.parseFragment(lexical_form)
1664+
result.normalize()
1665+
return result
1666+
1667+
1668+
def _write_html(value: xml.dom.minidom.DocumentFragment) -> str:
    """
    Serialize a document fragment representing an HTML literal into
    its lexical form.

    :param value: A document fragment representing an HTML literal.
    :return: The lexical form of the HTML literal, as text.
    """
    # No ``encoding`` is passed to the serializer, so it renders text
    # rather than encoded bytes; the return annotation reflects that.
    result: str = html5lib.serialize(value, tree="dom")
    return result
16551678

16561679

16571680
def _writeXML( # noqa: N802
@@ -1967,14 +1990,21 @@ def _castPythonToLiteral( # noqa: N802
19671990
(Duration, (lambda i: duration_isoformat(i), _XSD_DURATION)),
19681991
(timedelta, (lambda i: duration_isoformat(i), _XSD_DAYTIMEDURATION)),
19691992
(xml.dom.minidom.Document, (_writeXML, _RDF_XMLLITERAL)),
1970-
# this is a bit dirty - by accident the html5lib parser produces
1971-
# DocumentFragments, and the xml parser Documents, letting this
1972-
# decide what datatype to use makes roundtripping easier, but it a
1973-
# bit random
1974-
(xml.dom.minidom.DocumentFragment, (_writeXML, _RDF_HTMLLITERAL)),
19751993
(Fraction, (None, _OWL_RATIONAL)),
19761994
]
19771995

1996+
if html5lib is not None:
1997+
# This is a bit dirty, by accident the html5lib parser produces
1998+
# DocumentFragments, and the xml parser Documents, letting this
1999+
# decide what datatype to use makes roundtripping easier, but it is a
2000+
# bit random.
2001+
#
2002+
# This must happen before _GenericPythonToXSDRules is assigned to
2003+
# _OriginalGenericPythonToXSDRules.
2004+
_GenericPythonToXSDRules.append(
2005+
(xml.dom.minidom.DocumentFragment, (_write_html, _RDF_HTMLLITERAL))
2006+
)
2007+
19782008
_OriginalGenericPythonToXSDRules = list(_GenericPythonToXSDRules)
19792009

19802010
_SpecificPythonToXSDRules: List[
@@ -2025,9 +2055,13 @@ def _castPythonToLiteral( # noqa: N802
20252055
URIRef(_XSD_PFX + "base64Binary"): b64decode,
20262056
URIRef(_XSD_PFX + "anyURI"): None,
20272057
_RDF_XMLLITERAL: _parseXML,
2028-
_RDF_HTMLLITERAL: _parseHTML,
20292058
}
20302059

2060+
if html5lib is not None:
2061+
# It is probably best to keep this close to the definition of
2062+
# _GenericPythonToXSDRules so nobody misses it.
2063+
XSDToPython[_RDF_HTMLLITERAL] = _parse_html
2064+
20312065
_check_well_formed_types: Dict[URIRef, Callable[[Union[str, bytes], Any], bool]] = {
20322066
URIRef(_XSD_PFX + "boolean"): _well_formed_boolean,
20332067
URIRef(_XSD_PFX + "nonPositiveInteger"): _well_formed_non_positive_integer,

test/conftest.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
import pytest
55

6+
# This is here so that asserts from these modules are formatted for human
7+
# readability.
68
pytest.register_assert_rewrite("test.utils")
79

810
from pathlib import Path # noqa: E402
@@ -19,17 +21,14 @@
1921
Union,
2022
)
2123

22-
from rdflib import Graph
24+
from rdflib import Graph # noqa: E402
2325

2426
from .data import TEST_DATA_DIR
2527
from .utils.earl import EARLReporter # noqa: E402
2628
from .utils.httpservermock import ServedBaseHTTPServerMock # noqa: E402
2729

2830
pytest_plugins = [EARLReporter.__module__]
2931

30-
# This is here so that asserts from these modules are formatted for human
31-
# readibility.
32-
3332

3433
@pytest.fixture(scope="session")
3534
def http_file_server() -> Generator[HTTPFileServer, None, None]:

test/test_literal/test_literal.py

Lines changed: 95 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,15 @@
1+
from __future__ import annotations
2+
3+
import builtins
4+
import datetime
5+
import logging
6+
from decimal import Decimal
7+
from test.utils import affix_tuples
8+
from test.utils.literal import LiteralChecker, literal_idfn
9+
from test.utils.namespace import EGDC
10+
from test.utils.outcome import OutcomeChecker, OutcomePrimitive, OutcomePrimitives
11+
from typing import Any, Callable, Generator, Optional, Type, Union
12+
113
# NOTE: The config below enables strict mode for mypy.
214
# mypy: no_ignore_errors
315
# mypy: warn_unused_configs, disallow_any_generics
@@ -7,14 +19,13 @@
719
# mypy: no_implicit_optional, warn_redundant_casts, warn_unused_ignores
820
# mypy: warn_return_any, no_implicit_reexport, strict_equality
921

10-
import datetime
11-
import logging
12-
from decimal import Decimal
13-
from test.utils import affix_tuples
14-
from test.utils.literal import LiteralChecker
15-
from test.utils.namespace import EGDC
16-
from test.utils.outcome import OutcomeChecker, OutcomePrimitive, OutcomePrimitives
17-
from typing import Any, Callable, Generator, Optional, Type, Union
22+
23+
try:
24+
import html5lib as _ # noqa: F401
25+
26+
_HAVE_HTML5LIB = True
27+
except ImportError:
28+
_HAVE_HTML5LIB = False
1829

1930
import isodate
2031
import pytest
@@ -915,6 +926,21 @@ def unlexify(s: str) -> str:
915926
)
916927

917928

929+
class _UnknownType:
    """
    A class that is not known to rdflib, used to test how
    rdflib.term.Literal handles unknown python types.

    Every instance compares equal to every other instance, so an expected
    value built in a test matches the value carried by a literal.
    """

    def __repr__(self) -> str:
        return "_UnknownType()"

    def __eq__(self, __value: object) -> bool:
        if isinstance(__value, _UnknownType):
            return True
        # Defer to the other operand for foreign types rather than
        # answering on its behalf; Python falls back to identity, so plain
        # objects still compare unequal.
        return NotImplemented
942+
943+
918944
@pytest.mark.parametrize(
919945
["literal_maker", "outcome"],
920946
[
@@ -951,7 +977,30 @@ def unlexify(s: str) -> str:
951977
lambda: Literal(Literal("blue sky", "en")),
952978
Literal("blue sky", "en"),
953979
),
980+
(
981+
lambda: Literal("<body>", datatype=RDF.HTML),
982+
LiteralChecker(
983+
..., None, RDF.HTML, True if _HAVE_HTML5LIB else None, "<body>"
984+
),
985+
),
986+
(
987+
lambda: Literal("<table></table>", datatype=RDF.HTML),
988+
LiteralChecker(
989+
...,
990+
None,
991+
RDF.HTML,
992+
False if _HAVE_HTML5LIB else None,
993+
"<table></table>",
994+
),
995+
),
996+
(
997+
lambda: Literal(_UnknownType(), datatype=EGDC.UnknownType),
998+
LiteralChecker(
999+
_UnknownType(), None, EGDC.UnknownType, None, "_UnknownType()"
1000+
),
1001+
),
9541002
],
1003+
ids=literal_idfn,
9551004
)
9561005
def test_literal_construction(
9571006
literal_maker: Callable[[], Literal],
@@ -961,3 +1010,41 @@ def test_literal_construction(
9611010
with checker.context():
9621011
actual_outcome = literal_maker()
9631012
checker.check(actual_outcome)
1013+
1014+
1015+
@pytest.mark.parametrize(
    ["literal_maker", "normalize_literals", "outcome"],
    [
        # ``...`` means: do not touch rdflib.NORMALIZE_LITERALS at all.
        (
            lambda: Literal("001000", datatype=XSD.integer),
            ...,
            LiteralChecker(1000, None, XSD.integer, False, "1000"),
        ),
        # Explicitly enabled: the lexical form loses its leading zeros.
        (
            lambda: Literal("001000", datatype=XSD.integer),
            True,
            LiteralChecker(1000, None, XSD.integer, False, "1000"),
        ),
        # Explicitly disabled: the lexical form is kept exactly as given.
        (
            lambda: Literal("001000", datatype=XSD.integer),
            False,
            LiteralChecker(1000, None, XSD.integer, False, "001000"),
        ),
    ],
    ids=literal_idfn,
)
def test_global_normalize(
    literal_maker: Callable[[], Literal],
    normalize_literals: Union[builtins.ellipsis, bool],
    outcome: OutcomePrimitives[Literal],
) -> None:
    """
    Build a literal under a given ``rdflib.NORMALIZE_LITERALS`` setting and
    check it against the expected outcome.

    :param literal_maker: Zero-argument callable that constructs the literal.
    :param normalize_literals: Value to assign to
        ``rdflib.NORMALIZE_LITERALS`` before construction, or ``...`` to
        leave the current value in place.
    :param outcome: The expected outcome of calling ``literal_maker``.
    """
    # Remember the module-level flag so it can be put back no matter how
    # the check below ends.
    saved_setting = rdflib.NORMALIZE_LITERALS
    try:
        if normalize_literals is not ...:
            rdflib.NORMALIZE_LITERALS = normalize_literals
        outcome_checker = OutcomeChecker[Literal].from_primitives(outcome)
        with outcome_checker.context():
            produced = literal_maker()
            outcome_checker.check(produced)
    finally:
        rdflib.NORMALIZE_LITERALS = saved_setting
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
import xml.dom.minidom
2+
from test.utils.literal import LiteralChecker
3+
from test.utils.outcome import OutcomeChecker, OutcomePrimitives
4+
from typing import Callable
5+
6+
import pytest
7+
8+
import rdflib.term
9+
from rdflib.namespace import RDF
10+
from rdflib.term import Literal
11+
12+
try:
13+
import html5lib as _ # noqa: F401
14+
except ImportError:
15+
pytest.skip("html5lib not installed", allow_module_level=True)
16+
17+
18+
def test_has_html5lib() -> None:
    """
    Check that rdflib.term registered its ``rdf:HTML`` handling.

    This module is skipped when html5lib cannot be imported, so here the
    flag must be set, the datatype must have a parser entry, and the first
    ``DocumentFragment`` conversion rule must target ``rdf:HTML``.
    """
    term = rdflib.term
    assert term._HAS_HTML5LIB is True
    assert RDF.HTML in term.XSDToPython

    # Find the first Python-to-literal rule keyed on DocumentFragment.
    fragment_rule = None
    for candidate in term._GenericPythonToXSDRules:
        if candidate[0] is xml.dom.minidom.DocumentFragment:
            fragment_rule = candidate
            break

    assert fragment_rule is not None
    assert fragment_rule[1][1] == RDF.HTML
31+
32+
33+
@pytest.mark.parametrize(
34+
["factory", "outcome"],
35+
[
36+
# Ill-typed literals, these have lexical forms that result in
37+
# errors when parsed as HTML by html5lib.
38+
(
39+
lambda: Literal("<body><h1>Hello, World!</h1></body>", datatype=RDF.HTML),
40+
LiteralChecker(
41+
..., None, RDF.HTML, True, "<body><h1>Hello, World!</h1></body>"
42+
),
43+
),
44+
(
45+
lambda: Literal("<body></body>", datatype=RDF.HTML),
46+
LiteralChecker(..., None, RDF.HTML, True, "<body></body>"),
47+
),
48+
(
49+
lambda: Literal("<tr><td>THE TEXT IS IN HERE</td></tr>", datatype=RDF.HTML),
50+
LiteralChecker(
51+
..., None, RDF.HTML, True, "<tr><td>THE TEXT IS IN HERE</td></tr>"
52+
),
53+
),
54+
# Well-typed literals, these have lexical forms that parse
55+
# without errors with html5lib.
56+
(
57+
lambda: Literal("<table></table>", datatype=RDF.HTML),
58+
LiteralChecker(..., None, RDF.HTML, False, "<table></table>"),
59+
),
60+
(
61+
lambda: Literal(" <table> </table> ", datatype=RDF.HTML, normalize=True),
62+
LiteralChecker(..., None, RDF.HTML, False, " <table> </table> "),
63+
),
64+
(
65+
lambda: Literal(
66+
" <table> </table> ", datatype=RDF.HTML, normalize=False
67+
),
68+
LiteralChecker(..., None, RDF.HTML, False, " <table> </table> "),
69+
),
70+
],
71+
)
72+
def test_literal_construction(
73+
factory: Callable[[], Literal],
74+
outcome: OutcomePrimitives[Literal],
75+
) -> None:
76+
checker = OutcomeChecker[Literal].from_primitives(outcome)
77+
with checker.context():
78+
actual_outcome = factory()
79+
checker.check(actual_outcome)

test/test_sparql/test_sparql.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -844,6 +844,23 @@ def thrower(*args: Any, **kwargs: Any) -> None:
844844
],
845845
id="select-group-concat-optional-many",
846846
),
847+
pytest.param(
848+
"""
849+
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
850+
851+
SELECT * WHERE {
852+
BIND(STRDT("<body>", rdf:HTML) as ?tag1) # incorrectly disappearing literal
853+
BIND("<body>" as ?tag2) # correctly appearing literal
854+
}
855+
""",
856+
[
857+
{
858+
Variable("tag1"): Literal("<body>", datatype=RDF.HTML),
859+
Variable("tag2"): Literal("<body>"),
860+
}
861+
],
862+
id="select-bind-strdt-html",
863+
),
847864
],
848865
)
849866
def test_queries(

0 commit comments

Comments
 (0)