Skip to content

Commit

Permalink
fixes bugs after xml changes
Browse files Browse the repository at this point in the history
  • Loading branch information
huettenhain committed Sep 27, 2024
1 parent 4325741 commit cbfee53
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 43 deletions.
50 changes: 31 additions & 19 deletions refinery/lib/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,16 @@
import defusedxml.ElementTree as et
import collections

from typing import Any, Dict, Iterable, List, Optional
from typing import Any, Dict, Iterable, List, Optional, TYPE_CHECKING
from xml.parsers import expat
from xml.etree.ElementTree import Element, ElementTree

from refinery.lib.structures import MemoryFile
from refinery.lib.tools import exception_to_string

if TYPE_CHECKING:
from typing import Self


def ForgivingParse(data: bytes, entities=None) -> ElementTree:
"""
Expand Down Expand Up @@ -82,18 +85,18 @@ class XMLNodeBase:
__slots__ = 'tag', 'index', 'children', 'empty', 'attributes', 'content', '_parent', '__weakref__'

attributes: Dict[str, Any]
children: List[XMLNodeBase]
children: List[Self]
content: Optional[str]
parent: Optional[weakref.ProxyType[XMLNodeBase]]
subtree: Iterable[XMLNodeBase]
parent: Optional[weakref.ProxyType[Self]]
subtree: Iterable[Self]
empty: bool
tag: Optional[str]

def __init__(
self,
tag: str,
index: Optional[int],
parent: Optional[XMLNodeBase] = None,
tag: Optional[str],
index: Optional[int] = None,
parent: Optional[Self] = None,
content: Optional[str] = None,
empty: bool = False,
attributes: Optional[Dict[str, Any]] = None,
Expand Down Expand Up @@ -123,9 +126,6 @@ def parent(self, parent):
parent = weakref.ref(parent)
self._parent = parent

def __hash__(self):
return hash((hash(self.parent), self.tag, self.index))

def __eq__(self, other: XMLNodeBase):
return self.parent == other.parent and self.tag == other.tag and self.index == other.index

Expand Down Expand Up @@ -155,6 +155,21 @@ def __getitem__(self, key):
def get_attribute(self, key, default=None):
return self.attributes.get(key, default)

def reindex(self):
"""
Computes the index values of all nodes in the subtree.
"""
pre_count = collections.Counter(child.tag for child in self.children)
tag_count = collections.Counter()
for child in self.children:
tag = child.tag
if pre_count[tag] == 1:
child.index = None
else:
tag_count[tag] += 1
child.index = tag_count[tag]
child.reindex()

def child(self, tag: str):
"""
Return the first child with the given tag. This is useful especially for documents where
Expand All @@ -166,7 +181,7 @@ def child(self, tag: str):
raise LookupError(tag)

@property
def subtree(self) -> Iterable[XMLNodeBase]:
def subtree(self) -> Iterable[Self]:
"""
Iterate all items that are reachable from the current node.
"""
Expand All @@ -189,8 +204,8 @@ class XMLNode(XMLNodeBase):

source: Optional[Element]

def __init__(self, tag: str, index: int, parent: Optional[XMLNode] = None, source: Optional[Element] = None):
super().__init__(tag, index, parent)
def __init__(self, tag: str, parent: Optional[Self] = None, source: Optional[Element] = None):
super().__init__(tag, parent=parent)
self.source = source

def write(self, stream):
Expand All @@ -209,19 +224,16 @@ def parse(data) -> XMLNode:
tree that is generated by the standard library.
"""
def translate(element: Element, cursor: XMLNode, level: int = 0):
total = collections.Counter(child.tag for child in element)
count = collections.Counter()
for child in element:
tag = child.tag
index = None if total[tag] == 1 else count[tag]
node = XMLNode(tag, index, cursor, child)
count[tag] += 1
node = XMLNode(tag, cursor, child)
translate(child, node, level + 1)
cursor.children.append(node)
cursor.attributes = element.attrib
cursor.content = element.text or element.tail or ''
return cursor
root = ForgivingParse(data).getroot()
rt = translate(root, XMLNode(root.tag, None))
rt = translate(root, XMLNode(root.tag))
rt.source = root
rt.reindex()
return rt
35 changes: 13 additions & 22 deletions refinery/units/formats/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,7 @@
from refinery.lib.meta import metavars
from refinery.units.formats import XMLToPathExtractorUnit, UnpackResult, Arg

import io

from collections import Counter
from io import StringIO
from html.parser import HTMLParser

_HTML_DATA_ROOT_TAG = 'html'
Expand All @@ -27,7 +25,7 @@ def root(self) -> bool:
return self.tag == _HTML_DATA_ROOT_TAG

def recover(self, inner=True) -> str:
with io.StringIO() as stream:
with StringIO() as stream:
if not inner:
stream.write(self.content)
for child in self.children:
Expand Down Expand Up @@ -64,7 +62,7 @@ def __init__(self) -> None:
def handle_starttag(self, tag: str, attributes):
if tag in self._SELF_CLOSING_TAGS:
return
node = HTMLNode(tag, self.tos, self.get_starttag_text(), attributes={
node = HTMLNode(tag, None, self.tos, self.get_starttag_text(), attributes={
key: value for key, value in attributes if key and value})
children = self.tos.children
previous = children[-1] if children else None
Expand Down Expand Up @@ -92,7 +90,7 @@ def handle_entityref(self, name: str) -> None:
if last.textual:
last.content += ntt
return
self.tos.children.append(HTMLNode(None, self.tos, ntt))
self.tos.children.append(HTMLNode(None, None, self.tos, ntt))

def handle_charref(self, name: str) -> None:
self.handle_entityref(F'#{name}')
Expand All @@ -113,7 +111,7 @@ def handle_endtag(self, tag: str):
self.tos = cursor.parent

def handle_data(self, data):
self.tos.children.append(HTMLNode(None, self.tos, data))
self.tos.children.append(HTMLNode(None, None, self.tos, data))


class xthtml(XMLToPathExtractorUnit):
Expand All @@ -133,15 +131,20 @@ def unpack(self, data):
html = HTMLTreeParser()
html.feed(data.decode(self.codec))
root = html.tos
root.reindex()

meta = metavars(data)
path = self._make_path_builder(meta, root)

while root.parent:
self.log_info(F'tag was not closed: {root.tag}')
root = root.parent

while len(root.children) == 1 and root.children[0].tag == root.tag:
root, = root.children
while len(root.children) == 1:
child, = root.children
if child.tag != root.tag:
break
root = child

def tree(root: HTMLNode, *parts: str):

Expand All @@ -164,22 +167,10 @@ def inner(root: HTMLNode = root):
else:
yield UnpackResult(tagpath, inner, **meta)

tag_pre_count = Counter()
tag_run_count = Counter()
for child in root.children:
if child.textual:
continue
tag_pre_count[child.tag] += 1

for child in root.children:
if child.textual:
continue
if tag_pre_count[child.tag] == 1:
yield from tree(child, *parts, path(child))
continue
tag_run_count[child.tag] += 1
index = tag_run_count[child.tag]
yield from tree(child, *parts, path(child, index))
yield from tree(child, *parts, path(child))

yield from tree(root, path(root))

Expand Down
2 changes: 0 additions & 2 deletions refinery/units/formats/xml.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from collections import Counter

from refinery.lib.structures import MemoryFile
from refinery.lib.meta import metavars
from refinery.lib import xml
Expand Down

0 comments on commit cbfee53

Please sign in to comment.