From 075cb7cf7a2e332ba41e60f812878885434af89b Mon Sep 17 00:00:00 2001 From: Alex Vandiver Date: Tue, 27 Feb 2024 14:10:31 -0500 Subject: [PATCH] Make parsing of text be non-quadratic. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In Python, appending strings is not guaranteed to be constant-time, since they are documented to be immutable. In some corner cases, CPython is able to make these operations constant-time, but reaching into ETree objects is not such a case. This leads to parse times being quadratic in the size of the text in the input in pathological cases where parsing outputs a large number of adjacent text nodes which must be combined (e.g. HTML-escaped values). Specifically, we expect doubling the size of the input to result in approximately doubling the time to parse; instead, we observe quadratic behavior: ``` In [1]: import html5lib In [2]: %timeit -n1 -r5 html5lib.parse("<" * 200000) 2.99 s ± 269 ms per loop (mean ± std. dev. of 5 runs, 1 loop each) In [3]: %timeit -n1 -r5 html5lib.parse("<" * 400000) 6.7 s ± 242 ms per loop (mean ± std. dev. of 5 runs, 1 loop each) In [4]: %timeit -n1 -r5 html5lib.parse("<" * 800000) 19.5 s ± 1.48 s per loop (mean ± std. dev. of 5 runs, 1 loop each) ``` Switch from appending to the internal `str`, to appending text to an array of text chunks, as appends can be done in constant time. Using `bytearray` is a similar solution, but benchmarks slightly worse because the strings must be encoded before being appended. This improves parsing of text documents noticeably: ``` In [1]: import html5lib In [2]: %timeit -n1 -r5 html5lib.parse("<" * 200000) 2.3 s ± 373 ms per loop (mean ± std. dev. of 5 runs, 1 loop each) In [3]: %timeit -n1 -r5 html5lib.parse("<" * 400000) 3.85 s ± 29.7 ms per loop (mean ± std. dev. of 5 runs, 1 loop each) In [4]: %timeit -n1 -r5 html5lib.parse("<" * 800000) 8.04 s ± 317 ms per loop (mean ± std. dev. of 5 runs, 1 loop each) ``` --- html5lib/treebuilders/etree.py | 81 ++++++++++++++++++++++------------ html5lib/treewalkers/etree.py | 12 +++-- 2 files changed, 60 insertions(+), 33 deletions(-) diff --git a/html5lib/treebuilders/etree.py b/html5lib/treebuilders/etree.py index 0b745081..c0507980 100644 --- a/html5lib/treebuilders/etree.py +++ b/html5lib/treebuilders/etree.py @@ -16,6 +16,23 @@ tag_regexp = re.compile("{([^}]*)}(.*)") +class TextBuffer: + def __init__(self, initial=""): + self.chunks = [initial] + + def __str__(self): + return "".join(self.chunks) + + def getvalue(self): + return "".join(self.chunks) + + def append(self, other): + self.chunks.append(other) + + def __eq__(self, other): + return self.getvalue() == other + + def getETreeBuilder(ElementTreeImplementation, fullTree=False): ElementTree = ElementTreeImplementation ElementTreeCommentType = ElementTree.Comment("asd").tag @@ -110,25 +127,25 @@ def removeChild(self, node): def insertText(self, data, insertBefore=None): if not len(self._element): if not self._element.text: - self._element.text = "" - self._element.text += data + self._element.text = TextBuffer("") + self._element.text.append(data) elif insertBefore is None: # Insert the text as the tail of the last child element if not self._element[-1].tail: - self._element[-1].tail = "" - self._element[-1].tail += data + self._element[-1].tail = TextBuffer("") + self._element[-1].tail.append(data) else: # Insert the text before the specified node children = list(self._element) index = children.index(insertBefore._element) if index > 0: if not self._element[index - 1].tail: - self._element[index - 1].tail = "" - self._element[index - 1].tail += data + self._element[index - 1].tail = TextBuffer("") + self._element[index - 1].tail.append(data) else: if not self._element.text: - self._element.text = "" - self._element.text += data + self._element.text = TextBuffer("") + self._element.text.append(data) def cloneNode(self): element = type(self)(self.name, self.namespace) @@ -138,13 +155,15 @@ def cloneNode(self): def reparentChildren(self, newParent): if newParent.childNodes: - newParent.childNodes[-1]._element.tail += self._element.text + newParent.childNodes[-1]._element.tail.append( + self._element.text.getvalue() + ) else: if not newParent._element.text: - newParent._element.text = "" + newParent._element.text = TextBuffer("") if self._element.text is not None: - newParent._element.text += self._element.text - self._element.text = "" + newParent._element.text.append(self._element.text.getvalue()) + self._element.text = TextBuffer("") base.Node.reparentChildren(self, newParent) class Comment(Element): @@ -152,22 +171,23 @@ def __init__(self, data): # Use the superclass constructor to set all properties on the # wrapper element self._element = ElementTree.Comment(data) + self._element.text = TextBuffer(data) self.parent = None self._childNodes = [] self._flags = [] def _getData(self): - return self._element.text + return self._element.text.getvalue() def _setData(self, value): - self._element.text = value + self._element.text = TextBuffer(value) data = property(_getData, _setData) class DocumentType(Element): def __init__(self, name, publicId, systemId): Element.__init__(self, "") - self._element.text = name + self._element.text = TextBuffer(name) self.publicId = publicId self.systemId = systemId @@ -208,19 +228,19 @@ def serializeElement(element, indent=0): publicId = element.get("publicId") or "" systemId = element.get("systemId") or "" rv.append("""""" % - (element.text, publicId, systemId)) + (element.text.getvalue(), publicId, systemId)) else: - rv.append("" % (element.text,)) + rv.append("" % (element.text.getvalue(),)) elif element.tag == "DOCUMENT_ROOT": rv.append("#document") if element.text is not None: - rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text)) + rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text.getvalue())) if element.tail is not None: raise TypeError("Document node cannot have tail") if hasattr(element, "attrib") and len(element.attrib): raise TypeError("Document node cannot have attributes") elif element.tag == ElementTreeCommentType: - rv.append("|%s" % (' ' * indent, element.text)) + rv.append("|%s" % (' ' * indent, element.text.getvalue())) else: assert isinstance(element.tag, text_type), \ "Expected unicode, got %s, %s" % (type(element.tag), element.tag) @@ -248,13 +268,14 @@ def serializeElement(element, indent=0): for name, value in sorted(attributes): rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value)) - if element.text: - rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text)) + if element.text and element.text.getvalue(): + rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text.getvalue())) indent += 2 for child in element: serializeElement(child, indent) if element.tail: - rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail)) + rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail.getvalue())) + serializeElement(element, 0) return "\n".join(rv) @@ -272,13 +293,15 @@ def serializeElement(element): if element.get("publicId") or element.get("systemId"): publicId = element.get("publicId") or "" systemId = element.get("systemId") or "" - rv.append("""""" % - (element.text, publicId, systemId)) + rv.append( + """""" + % (element.text.getvalue(), publicId, systemId) + ) else: - rv.append("" % (element.text,)) + rv.append("" % (element.text.getvalue(),)) elif element.tag == "DOCUMENT_ROOT": if element.text is not None: - rv.append(element.text) + rv.append(element.text.getvalue()) if element.tail is not None: raise TypeError("Document node cannot have tail") if hasattr(element, "attrib") and len(element.attrib): @@ -288,7 +311,7 @@ def serializeElement(element): serializeElement(child) elif element.tag == ElementTreeCommentType: - rv.append("" % (element.text,)) + rv.append("" % (element.text.getvalue(),)) else: # This is assumed to be an ordinary element if not element.attrib: @@ -299,7 +322,7 @@ def serializeElement(element): for name, value in element.attrib.items()]) rv.append("<%s %s>" % (element.tag, attr)) if element.text: - rv.append(element.text) + rv.append(element.text.getvalue()) for child in element: serializeElement(child) @@ -307,7 +330,7 @@ def serializeElement(element): rv.append("" % (element.tag,)) if element.tail: - rv.append(element.tail) + rv.append(element.tail.getvalue()) serializeElement(element) diff --git a/html5lib/treewalkers/etree.py b/html5lib/treewalkers/etree.py index 411a1d45..47c8577e 100644 --- a/html5lib/treewalkers/etree.py +++ b/html5lib/treewalkers/etree.py @@ -33,7 +33,7 @@ def getNodeDetails(self, node): if isinstance(node, tuple): # It might be the root Element elt, _, _, flag = node if flag in ("text", "tail"): - return base.TEXT, getattr(elt, flag) + return base.TEXT, getattr(elt, flag).getvalue() else: node = elt @@ -44,11 +44,15 @@ def getNodeDetails(self, node): return (base.DOCUMENT,) elif node.tag == "": - return (base.DOCTYPE, node.text, - node.get("publicId"), node.get("systemId")) + return ( + base.DOCTYPE, + node.text.getvalue(), + node.get("publicId"), + node.get("systemId"), + ) elif node.tag == ElementTreeCommentType: - return base.COMMENT, node.text + return base.COMMENT, node.text.getvalue() else: assert isinstance(node.tag, string_types), type(node.tag)