Skip to content

Commit

Permalink
fix: handling of rdf:HTML literals (#2490)
Browse files Browse the repository at this point in the history
Previously, without `html5lib` installed, literals with the `rdf:HTML`
datatype were treated as
[ill-typed](https://www.w3.org/TR/rdf11-concepts/#section-Graph-Literal),
even if they were not ill-typed.

With this change, if `html5lib` is not installed, literals with the
`rdf:HTML` datatype will not be treated as ill-typed, and will have
`None` as their `ill_typed` attribute value, which means that it is
unknown whether they are ill-typed or not.

This change also fixes the mapping from `rdf:HTML` literal values to
lexical forms.

Other changes:

- Add tests for `rdflib.NORMALIZE_LITERALS` to ensure it behaves
  correctly.

Related issues:

- Fixes <#2475>
  • Loading branch information
aucampia authored Jul 19, 2023
1 parent 6981c28 commit 588286b
Show file tree
Hide file tree
Showing 7 changed files with 281 additions and 36 deletions.
68 changes: 51 additions & 17 deletions rdflib/term.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,15 @@
from .namespace import NamespaceManager
from .paths import AlternativePath, InvPath, NegatedPath, Path, SequencePath

_HAS_HTML5LIB = False

try:
import html5lib

_HAS_HTML5LIB = True
except ImportError:
html5lib = None

_SKOLEM_DEFAULT_AUTHORITY = "https://rdflib.github.io"

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -1638,20 +1647,34 @@ def _parseXML(xmlstring: str) -> xml.dom.minidom.Document: # noqa: N802
return retval


def _parseHTML(htmltext: str) -> xml.dom.minidom.DocumentFragment: # noqa: N802
try:
import html5lib
except ImportError:
raise ImportError(
"HTML5 parser not available. Try installing"
+ " html5lib <http://code.google.com/p/html5lib>"
)
def _parse_html(lexical_form: str) -> xml.dom.minidom.DocumentFragment:
"""
Parse the lexical form of an HTML literal into a document fragment
using the ``dom`` from html5lib tree builder.
:param lexical_form: The lexical form of the HTML literal.
:return: A document fragment representing the HTML literal.
:raises: `html5lib.html5parser.ParseError` if the lexical form is
not valid HTML.
"""
parser = html5lib.HTMLParser(
tree=html5lib.treebuilders.getTreeBuilder("dom"), strict=True
)
retval = parser.parseFragment(htmltext)
retval.normalize()
return retval
result: xml.dom.minidom.DocumentFragment = parser.parseFragment(lexical_form)
result.normalize()
return result


def _write_html(value: xml.dom.minidom.DocumentFragment) -> str:
    """
    Serialize a document fragment representing an HTML literal into
    its lexical form.

    :param value: A document fragment representing an HTML literal.
    :return: The lexical form of the HTML literal, as text.
    """
    # No ``encoding`` argument is passed to ``html5lib.serialize``, so the
    # serializer yields text rather than encoded bytes. The local annotation
    # pins that type, since html5lib itself is untyped.
    result: str = html5lib.serialize(value, tree="dom")
    return result


def _writeXML( # noqa: N802
Expand Down Expand Up @@ -1967,14 +1990,21 @@ def _castPythonToLiteral( # noqa: N802
(Duration, (lambda i: duration_isoformat(i), _XSD_DURATION)),
(timedelta, (lambda i: duration_isoformat(i), _XSD_DAYTIMEDURATION)),
(xml.dom.minidom.Document, (_writeXML, _RDF_XMLLITERAL)),
# this is a bit dirty - by accident the html5lib parser produces
# DocumentFragments, and the xml parser Documents, letting this
# decide what datatype to use makes roundtripping easier, but it a
# bit random
(xml.dom.minidom.DocumentFragment, (_writeXML, _RDF_HTMLLITERAL)),
(Fraction, (None, _OWL_RATIONAL)),
]

if html5lib is not None:
# This is a bit dirty: by accident the html5lib parser produces
# DocumentFragments and the xml parser produces Documents. Letting this
# decide what datatype to use makes roundtripping easier, but it is a
# bit random.
#
# This must happen before _GenericPythonToXSDRules is assigned to
# _OriginalGenericPythonToXSDRules.
_GenericPythonToXSDRules.append(
(xml.dom.minidom.DocumentFragment, (_write_html, _RDF_HTMLLITERAL))
)

_OriginalGenericPythonToXSDRules = list(_GenericPythonToXSDRules)

_SpecificPythonToXSDRules: List[
Expand Down Expand Up @@ -2025,9 +2055,13 @@ def _castPythonToLiteral( # noqa: N802
URIRef(_XSD_PFX + "base64Binary"): b64decode,
URIRef(_XSD_PFX + "anyURI"): None,
_RDF_XMLLITERAL: _parseXML,
_RDF_HTMLLITERAL: _parseHTML,
}

if html5lib is not None:
# It is probably best to keep this close to the definition of
# _GenericPythonToXSDRules so nobody misses it.
XSDToPython[_RDF_HTMLLITERAL] = _parse_html

_check_well_formed_types: Dict[URIRef, Callable[[Union[str, bytes], Any], bool]] = {
URIRef(_XSD_PFX + "boolean"): _well_formed_boolean,
URIRef(_XSD_PFX + "nonPositiveInteger"): _well_formed_non_positive_integer,
Expand Down
7 changes: 3 additions & 4 deletions test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

import pytest

# This is here so that asserts from these modules are formatted for human
# readability.
pytest.register_assert_rewrite("test.utils")

from pathlib import Path # noqa: E402
Expand All @@ -19,17 +21,14 @@
Union,
)

from rdflib import Graph
from rdflib import Graph # noqa: E402

from .data import TEST_DATA_DIR
from .utils.earl import EARLReporter # noqa: E402
from .utils.httpservermock import ServedBaseHTTPServerMock # noqa: E402

pytest_plugins = [EARLReporter.__module__]

# This is here so that asserts from these modules are formatted for human
# readibility.


@pytest.fixture(scope="session")
def http_file_server() -> Generator[HTTPFileServer, None, None]:
Expand Down
103 changes: 95 additions & 8 deletions test/test_literal/test_literal.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,15 @@
from __future__ import annotations

import builtins
import datetime
import logging
from decimal import Decimal
from test.utils import affix_tuples
from test.utils.literal import LiteralChecker, literal_idfn
from test.utils.namespace import EGDC
from test.utils.outcome import OutcomeChecker, OutcomePrimitive, OutcomePrimitives
from typing import Any, Callable, Generator, Optional, Type, Union

# NOTE: The config below enables strict mode for mypy.
# mypy: no_ignore_errors
# mypy: warn_unused_configs, disallow_any_generics
Expand All @@ -7,14 +19,13 @@
# mypy: no_implicit_optional, warn_redundant_casts, warn_unused_ignores
# mypy: warn_return_any, no_implicit_reexport, strict_equality

import datetime
import logging
from decimal import Decimal
from test.utils import affix_tuples
from test.utils.literal import LiteralChecker
from test.utils.namespace import EGDC
from test.utils.outcome import OutcomeChecker, OutcomePrimitive, OutcomePrimitives
from typing import Any, Callable, Generator, Optional, Type, Union

try:
import html5lib as _ # noqa: F401

_HAVE_HTML5LIB = True
except ImportError:
_HAVE_HTML5LIB = False

import isodate
import pytest
Expand Down Expand Up @@ -915,6 +926,21 @@ def unlexify(s: str) -> str:
)


class _UnknownType:
    """
    A class that is not known to rdflib, used to test how
    rdflib.term.Literal handles unknown Python types.

    Every instance has the same ``repr`` and compares equal to every other
    instance of this class.
    """

    def __repr__(self) -> str:
        return "_UnknownType()"

    def __eq__(self, __value: object) -> bool:
        # Instances are interchangeable; anything else is unequal.
        return isinstance(__value, _UnknownType)


@pytest.mark.parametrize(
["literal_maker", "outcome"],
[
Expand Down Expand Up @@ -951,7 +977,30 @@ def unlexify(s: str) -> str:
lambda: Literal(Literal("blue sky", "en")),
Literal("blue sky", "en"),
),
(
lambda: Literal("<body>", datatype=RDF.HTML),
LiteralChecker(
..., None, RDF.HTML, True if _HAVE_HTML5LIB else None, "<body>"
),
),
(
lambda: Literal("<table></table>", datatype=RDF.HTML),
LiteralChecker(
...,
None,
RDF.HTML,
False if _HAVE_HTML5LIB else None,
"<table></table>",
),
),
(
lambda: Literal(_UnknownType(), datatype=EGDC.UnknownType),
LiteralChecker(
_UnknownType(), None, EGDC.UnknownType, None, "_UnknownType()"
),
),
],
ids=literal_idfn,
)
def test_literal_construction(
literal_maker: Callable[[], Literal],
Expand All @@ -961,3 +1010,41 @@ def test_literal_construction(
with checker.context():
actual_outcome = literal_maker()
checker.check(actual_outcome)


@pytest.mark.parametrize(
    ["literal_maker", "normalize_literals", "outcome"],
    [
        (
            lambda: Literal("001000", datatype=XSD.integer),
            ...,
            LiteralChecker(1000, None, XSD.integer, False, "1000"),
        ),
        (
            lambda: Literal("001000", datatype=XSD.integer),
            True,
            LiteralChecker(1000, None, XSD.integer, False, "1000"),
        ),
        (
            lambda: Literal("001000", datatype=XSD.integer),
            False,
            LiteralChecker(1000, None, XSD.integer, False, "001000"),
        ),
    ],
    ids=literal_idfn,
)
def test_global_normalize(
    literal_maker: Callable[[], Literal],
    normalize_literals: Union[builtins.ellipsis, bool],
    outcome: OutcomePrimitives[Literal],
) -> None:
    """
    Check literal construction under different values of
    ``rdflib.NORMALIZE_LITERALS``.

    :param literal_maker: Factory that constructs the literal under test.
    :param normalize_literals: Value to assign to
        ``rdflib.NORMALIZE_LITERALS`` before constructing the literal, or
        ``...`` to leave the global at its current value.
    :param outcome: The expected outcome of calling ``literal_maker``.
    """
    # Remember the global so it can be put back no matter how the test ends.
    saved_setting = rdflib.NORMALIZE_LITERALS
    try:
        if normalize_literals is not ...:
            rdflib.NORMALIZE_LITERALS = normalize_literals
        outcome_checker = OutcomeChecker[Literal].from_primitives(outcome)
        with outcome_checker.context():
            produced = literal_maker()
            outcome_checker.check(produced)
    finally:
        rdflib.NORMALIZE_LITERALS = saved_setting
79 changes: 79 additions & 0 deletions test/test_literal/test_literal_html5lib.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import xml.dom.minidom
from test.utils.literal import LiteralChecker
from test.utils.outcome import OutcomeChecker, OutcomePrimitives
from typing import Callable

import pytest

import rdflib.term
from rdflib.namespace import RDF
from rdflib.term import Literal

try:
import html5lib as _ # noqa: F401
except ImportError:
pytest.skip("html5lib not installed", allow_module_level=True)


def test_has_html5lib() -> None:
    """
    Check that rdflib.term registered its HTML support when html5lib is
    importable: the availability flag is set, ``rdf:HTML`` has an entry in
    ``XSDToPython``, and ``DocumentFragment`` values map to ``rdf:HTML``.
    """
    assert rdflib.term._HAS_HTML5LIB is True
    assert RDF.HTML in rdflib.term.XSDToPython

    # Collect the rules registered for DocumentFragment; only the first one
    # is inspected.
    fragment_rules = [
        entry
        for entry in rdflib.term._GenericPythonToXSDRules
        if entry[0] is xml.dom.minidom.DocumentFragment
    ]
    rule = fragment_rules[0] if fragment_rules else None
    assert rule is not None
    assert rule[1][1] == RDF.HTML


@pytest.mark.parametrize(
    ["factory", "outcome"],
    [
        # Ill-typed literals: their lexical forms result in errors when
        # parsed as HTML by html5lib.
        (
            lambda: Literal("<body><h1>Hello, World!</h1></body>", datatype=RDF.HTML),
            LiteralChecker(
                ..., None, RDF.HTML, True, "<body><h1>Hello, World!</h1></body>"
            ),
        ),
        (
            lambda: Literal("<body></body>", datatype=RDF.HTML),
            LiteralChecker(..., None, RDF.HTML, True, "<body></body>"),
        ),
        (
            lambda: Literal("<tr><td>THE TEXT IS IN HERE</td></tr>", datatype=RDF.HTML),
            LiteralChecker(
                ..., None, RDF.HTML, True, "<tr><td>THE TEXT IS IN HERE</td></tr>"
            ),
        ),
        # Well-typed literals: their lexical forms parse without errors
        # with html5lib.
        (
            lambda: Literal("<table></table>", datatype=RDF.HTML),
            LiteralChecker(..., None, RDF.HTML, False, "<table></table>"),
        ),
        (
            lambda: Literal(" <table> </table> ", datatype=RDF.HTML, normalize=True),
            LiteralChecker(..., None, RDF.HTML, False, " <table> </table> "),
        ),
        (
            lambda: Literal(
                " <table> </table> ", datatype=RDF.HTML, normalize=False
            ),
            LiteralChecker(..., None, RDF.HTML, False, " <table> </table> "),
        ),
    ],
)
def test_literal_construction(
    factory: Callable[[], Literal],
    outcome: OutcomePrimitives[Literal],
) -> None:
    """
    Construct an ``rdf:HTML`` literal and compare it with the expected outcome.

    :param factory: Callable that constructs the literal under test.
    :param outcome: The expected outcome of calling ``factory``.
    """
    outcome_checker = OutcomeChecker[Literal].from_primitives(outcome)
    with outcome_checker.context():
        produced = factory()
        outcome_checker.check(produced)
17 changes: 17 additions & 0 deletions test/test_sparql/test_sparql.py
Original file line number Diff line number Diff line change
Expand Up @@ -844,6 +844,23 @@ def thrower(*args: Any, **kwargs: Any) -> None:
],
id="select-group-concat-optional-many",
),
pytest.param(
"""
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
SELECT * WHERE {
BIND(STRDT("<body>", rdf:HTML) as ?tag1) # incorrectly disappearing literal
BIND("<body>" as ?tag2) # correctly appearing literal
}
""",
[
{
Variable("tag1"): Literal("<body>", datatype=RDF.HTML),
Variable("tag2"): Literal("<body>"),
}
],
id="select-bind-strdt-html",
),
],
)
def test_queries(
Expand Down
Loading

0 comments on commit 588286b

Please sign in to comment.