-
Notifications
You must be signed in to change notification settings - Fork 561
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: handling of
rdf:HTML
literals (#2490)
Previously, without `html5lib` installed, literals with `rdf:HTML` datatypes were treated as [ill-typed](https://www.w3.org/TR/rdf11-concepts/#section-Graph-Literal), even if they were not ill-typed. With this change, if `html5lib` is not installed, literals with the `rdf:HTML` datatype will not be treated as ill-typed, and will have `None` as their `ill_typed` attribute value, which means that it is unknown whether they are ill-typed or not. This change also fixes the mapping from `rdf:HTML` literal values to lexical forms. Other changes: - Add tests for `rdflib.NORMALIZE_LITERALS` to ensure it behaves correctly. Related issues: - Fixes <#2475>
- Loading branch information
Showing
7 changed files
with
281 additions
and
36 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
import xml.dom.minidom | ||
from test.utils.literal import LiteralChecker | ||
from test.utils.outcome import OutcomeChecker, OutcomePrimitives | ||
from typing import Callable | ||
|
||
import pytest | ||
|
||
import rdflib.term | ||
from rdflib.namespace import RDF | ||
from rdflib.term import Literal | ||
|
||
try: | ||
import html5lib as _ # noqa: F401 | ||
except ImportError: | ||
pytest.skip("html5lib not installed", allow_module_level=True) | ||
|
||
|
||
def test_has_html5lib() -> None:
    """Check that rdflib reports html5lib as present and maps ``rdf:HTML``.

    Verifies three things: the ``_HAS_HTML5LIB`` flag is ``True``,
    ``RDF.HTML`` is a key of ``XSDToPython``, and the generic
    Python-to-XSD rules contain an entry for
    ``xml.dom.minidom.DocumentFragment`` that points at ``RDF.HTML``.
    """
    assert rdflib.term._HAS_HTML5LIB is True
    assert RDF.HTML in rdflib.term.XSDToPython

    # Locate the first rule whose leading element is the DocumentFragment
    # class; stays None when no such rule is registered.
    fragment_rule = None
    for candidate in rdflib.term._GenericPythonToXSDRules:
        if candidate[0] is xml.dom.minidom.DocumentFragment:
            fragment_rule = candidate
            break

    assert fragment_rule is not None
    assert fragment_rule[1][1] == RDF.HTML
|
||
|
||
def _html_literal_case(lexical_form: str, ill_typed: bool, **literal_kwargs: bool):
    """Build one ``(factory, outcome)`` pair for ``test_literal_construction``.

    :param lexical_form: Text passed to ``Literal`` and also handed to
        ``LiteralChecker`` as its last positional argument.
    :param ill_typed: Value handed to ``LiteralChecker`` as its fourth
        positional argument.
    :param literal_kwargs: Extra keyword arguments forwarded to ``Literal``
        (only ``normalize`` is used by the cases below).
    :return: A two-tuple of a zero-argument callable that constructs the
        literal and the checker describing the expected result.
    """
    return (
        # Construction is deferred so that it happens inside the test body.
        lambda: Literal(lexical_form, datatype=RDF.HTML, **literal_kwargs),
        LiteralChecker(..., None, RDF.HTML, ill_typed, lexical_form),
    )


@pytest.mark.parametrize(
    ("factory", "outcome"),
    [
        # Ill-typed literals, these have lexical forms that result in
        # errors when parsed as HTML by html5lib.
        _html_literal_case("<body><h1>Hello, World!</h1></body>", True),
        _html_literal_case("<body></body>", True),
        _html_literal_case("<tr><td>THE TEXT IS IN HERE</td></tr>", True),
        # Well-typed literals, these have lexical forms that parse
        # without errors with html5lib.
        _html_literal_case("<table></table>", False),
        _html_literal_case(" <table> </table> ", False, normalize=True),
        _html_literal_case(" <table> </table> ", False, normalize=False),
    ],
)
def test_literal_construction(
    factory: Callable[[], Literal],
    outcome: OutcomePrimitives[Literal],
) -> None:
    """Construct an ``rdf:HTML`` literal and compare it with the expected outcome.

    :param factory: Zero-argument callable that builds the literal under test.
    :param outcome: Expected outcome, converted to an ``OutcomeChecker``
        via ``from_primitives``.
    """
    outcome_checker = OutcomeChecker[Literal].from_primitives(outcome)
    with outcome_checker.context():
        constructed = factory()
        outcome_checker.check(constructed)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.