fix: handling of rdf:HTML literals (#2490)

aucampia · web-flow · commit 588286bfb342 · 2023-07-19T19:55:53.000+02:00
Previously, without `html5lib` installed, literals with `rdf:HTML` datatypes were treated as [ill-typed](https://www.w3.org/TR/rdf11-concepts/#section-Graph-Literal), even if they were not ill-typed. With this change, if `html5lib` is not installed, literals with the `rdf:HTML` datatype will not be treated as ill-typed, and will have `None` as their `ill_typed` attribute value, which means that it is unknown whether they are ill-typed or not. This change also fixes the mapping from `rdf:HTML` literal values to lexical forms. Other changes: - Add tests for `rdflib.NORMALIZE_LITERALS` to ensure it behaves correctly. Related issues: - Fixes <#2475>
diff --git a/rdflib/term.py b/rdflib/term.py
@@ -76,6 +76,15 @@
     from .namespace import NamespaceManager
     from .paths import AlternativePath, InvPath, NegatedPath, Path, SequencePath
 
+_HAS_HTML5LIB = False
+
+try:
+    import html5lib
+
+    _HAS_HTML5LIB = True
+except ImportError:
+    html5lib = None
+
 _SKOLEM_DEFAULT_AUTHORITY = "https://rdflib.github.io"
 
 logger = logging.getLogger(__name__)
@@ -1638,20 +1647,34 @@ def _parseXML(xmlstring: str) -> xml.dom.minidom.Document:  # noqa: N802
     return retval
 
 
-def _parseHTML(htmltext: str) -> xml.dom.minidom.DocumentFragment:  # noqa: N802
-    try:
-        import html5lib
-    except ImportError:
-        raise ImportError(
-            "HTML5 parser not available. Try installing"
-            + " html5lib <http://code.google.com/p/html5lib>"
-        )
+def _parse_html(lexical_form: str) -> xml.dom.minidom.DocumentFragment:
+    """
+    Parse the lexical form of an HTML literal into a document fragment
+    using the ``dom`` tree builder from html5lib.
+
+    :param lexical_form: The lexical form of the HTML literal.
+    :return: A document fragment representing the HTML literal.
+    :raises: `html5lib.html5parser.ParseError` if the lexical form is
+        not valid HTML.
+    """
     parser = html5lib.HTMLParser(
         tree=html5lib.treebuilders.getTreeBuilder("dom"), strict=True
     )
-    retval = parser.parseFragment(htmltext)
-    retval.normalize()
-    return retval
+    result: xml.dom.minidom.DocumentFragment = parser.parseFragment(lexical_form)
+    result.normalize()
+    return result
+
+
+def _write_html(value: xml.dom.minidom.DocumentFragment) -> str:
+    """
+    Serialize a document fragment representing an HTML literal into
+    its lexical form.
+
+    :param value: A document fragment representing an HTML literal.
+    :return: The lexical form of the HTML literal.
+    """
+    result = html5lib.serialize(value, tree="dom")
+    return result
 
 
 def _writeXML(  # noqa: N802
@@ -1967,14 +1990,21 @@ def _castPythonToLiteral(  # noqa: N802
     (Duration, (lambda i: duration_isoformat(i), _XSD_DURATION)),
     (timedelta, (lambda i: duration_isoformat(i), _XSD_DAYTIMEDURATION)),
     (xml.dom.minidom.Document, (_writeXML, _RDF_XMLLITERAL)),
-    # this is a bit dirty - by accident the html5lib parser produces
-    # DocumentFragments, and the xml parser Documents, letting this
-    # decide what datatype to use makes roundtripping easier, but it a
-    # bit random
-    (xml.dom.minidom.DocumentFragment, (_writeXML, _RDF_HTMLLITERAL)),
     (Fraction, (None, _OWL_RATIONAL)),
 ]
 
+if html5lib is not None:
+    # This is a bit dirty: by accident the html5lib parser produces
+    # DocumentFragments and the xml parser produces Documents. Letting
+    # this decide what datatype to use makes roundtripping easier, but
+    # it is a bit random.
+    #
+    # This must happen before _GenericPythonToXSDRules is assigned to
+    # _OriginalGenericPythonToXSDRules.
+    _GenericPythonToXSDRules.append(
+        (xml.dom.minidom.DocumentFragment, (_write_html, _RDF_HTMLLITERAL))
+    )
+
 _OriginalGenericPythonToXSDRules = list(_GenericPythonToXSDRules)
 
 _SpecificPythonToXSDRules: List[
@@ -2025,9 +2055,13 @@ def _castPythonToLiteral(  # noqa: N802
     URIRef(_XSD_PFX + "base64Binary"): b64decode,
     URIRef(_XSD_PFX + "anyURI"): None,
     _RDF_XMLLITERAL: _parseXML,
-    _RDF_HTMLLITERAL: _parseHTML,
 }
 
+if html5lib is not None:
+    # It is probably best to keep this close to the definition of
+    # _GenericPythonToXSDRules so nobody misses it.
+    XSDToPython[_RDF_HTMLLITERAL] = _parse_html
+
 _check_well_formed_types: Dict[URIRef, Callable[[Union[str, bytes], Any], bool]] = {
     URIRef(_XSD_PFX + "boolean"): _well_formed_boolean,
     URIRef(_XSD_PFX + "nonPositiveInteger"): _well_formed_non_positive_integer,
diff --git a/test/conftest.py b/test/conftest.py
@@ -3,6 +3,8 @@
 
 import pytest
 
+# This is here so that asserts from these modules are formatted for human
+# readability.
 pytest.register_assert_rewrite("test.utils")
 
 from pathlib import Path  # noqa: E402
@@ -19,17 +21,14 @@
     Union,
 )
 
-from rdflib import Graph
+from rdflib import Graph  # noqa: E402
 
 from .data import TEST_DATA_DIR
 from .utils.earl import EARLReporter  # noqa: E402
 from .utils.httpservermock import ServedBaseHTTPServerMock  # noqa: E402
 
 pytest_plugins = [EARLReporter.__module__]
 
-# This is here so that asserts from these modules are formatted for human
-# readibility.
-
 
 @pytest.fixture(scope="session")
 def http_file_server() -> Generator[HTTPFileServer, None, None]:
diff --git a/test/test_literal/test_literal.py b/test/test_literal/test_literal.py
@@ -1,3 +1,15 @@
+from __future__ import annotations
+
+import builtins
+import datetime
+import logging
+from decimal import Decimal
+from test.utils import affix_tuples
+from test.utils.literal import LiteralChecker, literal_idfn
+from test.utils.namespace import EGDC
+from test.utils.outcome import OutcomeChecker, OutcomePrimitive, OutcomePrimitives
+from typing import Any, Callable, Generator, Optional, Type, Union
+
 # NOTE: The config below enables strict mode for mypy.
 # mypy: no_ignore_errors
 # mypy: warn_unused_configs, disallow_any_generics
@@ -7,14 +19,13 @@
 # mypy: no_implicit_optional, warn_redundant_casts, warn_unused_ignores
 # mypy: warn_return_any, no_implicit_reexport, strict_equality
 
-import datetime
-import logging
-from decimal import Decimal
-from test.utils import affix_tuples
-from test.utils.literal import LiteralChecker
-from test.utils.namespace import EGDC
-from test.utils.outcome import OutcomeChecker, OutcomePrimitive, OutcomePrimitives
-from typing import Any, Callable, Generator, Optional, Type, Union
+
+try:
+    import html5lib as _  # noqa: F401
+
+    _HAVE_HTML5LIB = True
+except ImportError:
+    _HAVE_HTML5LIB = False
 
 import isodate
 import pytest
@@ -915,6 +926,21 @@ def unlexify(s: str) -> str:
     )
 
 
+class _UnknownType:
+    """
+    A class that is not known to rdflib, used to test how
+    rdflib.term.Literal handles unknown python types.
+    """
+
+    def __repr__(self) -> str:
+        return "_UnknownType()"
+
+    def __eq__(self, __value: object) -> bool:
+        if isinstance(__value, _UnknownType):
+            return True
+        return False
+
+
 @pytest.mark.parametrize(
     ["literal_maker", "outcome"],
     [
@@ -951,7 +977,30 @@ def unlexify(s: str) -> str:
             lambda: Literal(Literal("blue sky", "en")),
             Literal("blue sky", "en"),
         ),
+        (
+            lambda: Literal("<body>", datatype=RDF.HTML),
+            LiteralChecker(
+                ..., None, RDF.HTML, True if _HAVE_HTML5LIB else None, "<body>"
+            ),
+        ),
+        (
+            lambda: Literal("<table></table>", datatype=RDF.HTML),
+            LiteralChecker(
+                ...,
+                None,
+                RDF.HTML,
+                False if _HAVE_HTML5LIB else None,
+                "<table></table>",
+            ),
+        ),
+        (
+            lambda: Literal(_UnknownType(), datatype=EGDC.UnknownType),
+            LiteralChecker(
+                _UnknownType(), None, EGDC.UnknownType, None, "_UnknownType()"
+            ),
+        ),
     ],
+    ids=literal_idfn,
 )
 def test_literal_construction(
     literal_maker: Callable[[], Literal],
@@ -961,3 +1010,41 @@ def test_literal_construction(
     with checker.context():
         actual_outcome = literal_maker()
         checker.check(actual_outcome)
+
+
+@pytest.mark.parametrize(
+    ["literal_maker", "normalize_literals", "outcome"],
+    [
+        (
+            lambda: Literal("001000", datatype=XSD.integer),
+            ...,
+            LiteralChecker(1000, None, XSD.integer, False, "1000"),
+        ),
+        (
+            lambda: Literal("001000", datatype=XSD.integer),
+            True,
+            LiteralChecker(1000, None, XSD.integer, False, "1000"),
+        ),
+        (
+            lambda: Literal("001000", datatype=XSD.integer),
+            False,
+            LiteralChecker(1000, None, XSD.integer, False, "001000"),
+        ),
+    ],
+    ids=literal_idfn,
+)
+def test_global_normalize(
+    literal_maker: Callable[[], Literal],
+    normalize_literals: Union[builtins.ellipsis, bool],
+    outcome: OutcomePrimitives[Literal],
+) -> None:
+    _normalize_literals = rdflib.NORMALIZE_LITERALS
+    try:
+        if normalize_literals is not ...:
+            rdflib.NORMALIZE_LITERALS = normalize_literals
+        checker = OutcomeChecker[Literal].from_primitives(outcome)
+        with checker.context():
+            actual_outcome = literal_maker()
+            checker.check(actual_outcome)
+    finally:
+        rdflib.NORMALIZE_LITERALS = _normalize_literals
diff --git a/test/test_literal/test_literal_html5lib.py b/test/test_literal/test_literal_html5lib.py
@@ -0,0 +1,79 @@
+import xml.dom.minidom
+from test.utils.literal import LiteralChecker
+from test.utils.outcome import OutcomeChecker, OutcomePrimitives
+from typing import Callable
+
+import pytest
+
+import rdflib.term
+from rdflib.namespace import RDF
+from rdflib.term import Literal
+
+try:
+    import html5lib as _  # noqa: F401
+except ImportError:
+    pytest.skip("html5lib not installed", allow_module_level=True)
+
+
+def test_has_html5lib() -> None:
+    assert rdflib.term._HAS_HTML5LIB is True
+    assert RDF.HTML in rdflib.term.XSDToPython
+    rule = next(
+        (
+            item
+            for item in rdflib.term._GenericPythonToXSDRules
+            if item[0] is xml.dom.minidom.DocumentFragment
+        ),
+        None,
+    )
+    assert rule is not None
+    assert rule[1][1] == RDF.HTML
+
+
+@pytest.mark.parametrize(
+    ["factory", "outcome"],
+    [
+        # Ill-typed literals, these have lexical forms that result in
+        # errors when parsed as HTML by html5lib.
+        (
+            lambda: Literal("<body><h1>Hello, World!</h1></body>", datatype=RDF.HTML),
+            LiteralChecker(
+                ..., None, RDF.HTML, True, "<body><h1>Hello, World!</h1></body>"
+            ),
+        ),
+        (
+            lambda: Literal("<body></body>", datatype=RDF.HTML),
+            LiteralChecker(..., None, RDF.HTML, True, "<body></body>"),
+        ),
+        (
+            lambda: Literal("<tr><td>THE TEXT IS IN HERE</td></tr>", datatype=RDF.HTML),
+            LiteralChecker(
+                ..., None, RDF.HTML, True, "<tr><td>THE TEXT IS IN HERE</td></tr>"
+            ),
+        ),
+        # Well-typed literals, these have lexical forms that parse
+        # without errors with html5lib.
+        (
+            lambda: Literal("<table></table>", datatype=RDF.HTML),
+            LiteralChecker(..., None, RDF.HTML, False, "<table></table>"),
+        ),
+        (
+            lambda: Literal("  <table>  </table>  ", datatype=RDF.HTML, normalize=True),
+            LiteralChecker(..., None, RDF.HTML, False, "  <table>  </table>  "),
+        ),
+        (
+            lambda: Literal(
+                "  <table>  </table>  ", datatype=RDF.HTML, normalize=False
+            ),
+            LiteralChecker(..., None, RDF.HTML, False, "  <table>  </table>  "),
+        ),
+    ],
+)
+def test_literal_construction(
+    factory: Callable[[], Literal],
+    outcome: OutcomePrimitives[Literal],
+) -> None:
+    checker = OutcomeChecker[Literal].from_primitives(outcome)
+    with checker.context():
+        actual_outcome = factory()
+        checker.check(actual_outcome)
diff --git a/test/test_sparql/test_sparql.py b/test/test_sparql/test_sparql.py
@@ -844,6 +844,23 @@ def thrower(*args: Any, **kwargs: Any) -> None:
             ],
             id="select-group-concat-optional-many",
         ),
+        pytest.param(
+            """
+            PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+
+            SELECT * WHERE {
+                BIND(STRDT("<body>", rdf:HTML) as ?tag1) # incorrectly disappearing literal
+                BIND("<body>" as ?tag2)                  # correctly appearing literal
+            }
+            """,
+            [
+                {
+                    Variable("tag1"): Literal("<body>", datatype=RDF.HTML),
+                    Variable("tag2"): Literal("<body>"),
+                }
+            ],
+            id="select-bind-strdt-html",
+        ),
     ],
 )
 def test_queries(
diff --git a/test/utils/literal.py b/test/utils/literal.py
diff --git a/tox.ini b/tox.ini