Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve data model #182

Merged
merged 6 commits into from
Nov 26, 2018
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
[Unreleased]
------------

Added
^^^^^^^
* `@senwu`_: Add support to parse multiple sections in parser, fix webpage context, and
add name column for each context in data model.
(`#175 <https://github.com/HazyResearch/fonduer/pull/175>`_)


[0.3.6] - 2018-11-15
--------------------

Expand Down
5 changes: 4 additions & 1 deletion src/fonduer/parser/models/caption.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from sqlalchemy import Column, ForeignKey, Integer, UniqueConstraint
from sqlalchemy import Column, ForeignKey, Integer, String, UniqueConstraint
from sqlalchemy.orm import backref, relationship

from fonduer.parser.models.context import Context
Expand All @@ -18,6 +18,9 @@ class Caption(Context):
#: The position of the ``Caption`` in the ``Document``.
position = Column(Integer, nullable=False)

#: The name of a ``Caption``.
name = Column(String, unique=False, nullable=True)

#: The id of the parent ``Document``.
document_id = Column(Integer, ForeignKey("document.id"))
#: The parent ``Document``.
Expand Down
3 changes: 3 additions & 0 deletions src/fonduer/parser/models/figure.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ class Figure(Context):
#: The position of the ``Figure`` in the ``Document``.
position = Column(Integer, nullable=False)

#: The name of a ``Figure``.
name = Column(String, unique=False, nullable=True)

#: The id of the parent ``Document``.
document_id = Column(Integer, ForeignKey("document.id", ondelete="CASCADE"))
#: The parent ``Document``.
Expand Down
5 changes: 4 additions & 1 deletion src/fonduer/parser/models/paragraph.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from sqlalchemy import Column, ForeignKey, Integer, UniqueConstraint
from sqlalchemy import Column, ForeignKey, Integer, String, UniqueConstraint
from sqlalchemy.orm import backref, relationship

from fonduer.parser.models.context import Context
Expand All @@ -18,6 +18,9 @@ class Paragraph(Context):
#: The position of the ``Paragraph`` in the ``Document``.
position = Column(Integer, nullable=False)

#: The name of a ``Paragraph``.
name = Column(String, unique=False, nullable=True)

#: The id of the parent ``Document``.
document_id = Column(Integer, ForeignKey("document.id"))
#: The parent ``Document``.
Expand Down
5 changes: 4 additions & 1 deletion src/fonduer/parser/models/section.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from sqlalchemy import Column, ForeignKey, Integer, UniqueConstraint
from sqlalchemy import Column, ForeignKey, Integer, String, UniqueConstraint
from sqlalchemy.orm import backref, relationship

from fonduer.parser.models.context import Context
Expand All @@ -17,6 +17,9 @@ class Section(Context):
#: The unique id of the ``Section``.
id = Column(Integer, ForeignKey("context.id", ondelete="CASCADE"), primary_key=True)

#: The name of a ``Section``.
name = Column(String, unique=False, nullable=True)

#: The position of the ``Section`` in a ``Document``.
position = Column(Integer, nullable=False)

Expand Down
3 changes: 3 additions & 0 deletions src/fonduer/parser/models/sentence.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,9 @@ class Sentence(
#: The position of the ``Sentence`` in the ``Document``.
position = Column(Integer, nullable=False) # unique sentence number per document

#: The name of a ``Sentence``.
name = Column(String, unique=False, nullable=True)

#: The id of the parent ``Document``.
document_id = Column(Integer, ForeignKey("document.id"))

Expand Down
8 changes: 7 additions & 1 deletion src/fonduer/parser/models/table.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from sqlalchemy import Column, ForeignKey, Integer, UniqueConstraint
from sqlalchemy import Column, ForeignKey, Integer, String, UniqueConstraint
from sqlalchemy.orm import backref, relationship

from fonduer.parser.models.context import Context
Expand All @@ -18,6 +18,9 @@ class Table(Context):
#: The position of the ``Table`` in the ``Document``.
position = Column(Integer, nullable=False)

#: The name of a ``Table``.
name = Column(String, unique=False, nullable=True)

#: The id of the parent ``Document``.
document_id = Column(Integer, ForeignKey("document.id"))
#: The parent ``Document``.
Expand Down Expand Up @@ -64,6 +67,9 @@ class Cell(Context):
#: The position of the ``Cell`` in the ``Table``.
position = Column(Integer, nullable=False)

#: The name of a ``Cell``.
name = Column(String, unique=False, nullable=True)

#: The id of the parent ``Table``.
table_id = Column(Integer, ForeignKey("table.id"))
#: The parent ``Table``.
Expand Down
14 changes: 8 additions & 6 deletions src/fonduer/parser/models/webpage.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
from sqlalchemy import Column, ForeignKey, Integer, String

from fonduer.parser.models.document import Document
from fonduer.parser.models.context import Context


class Webpage(Document):
"""A Webpage document context enhanced with additional metadata."""
class Webpage(Context):
"""A Webpage Context enhanced with additional metadata."""

__tablename__ = "webpage"

#: The unique id of the ``Webpage``.
id = Column(
Integer, ForeignKey("document.id", ondelete="CASCADE"), primary_key=True
)
id = Column(Integer, ForeignKey("context.id", ondelete="CASCADE"), primary_key=True)

#: The name of a ``Webpage``.
name = Column(String, unique=False, nullable=True)

#: The URL of the ``Webpage``.
url = Column(String)
#: The host of the ``Webpage``.
Expand Down
40 changes: 36 additions & 4 deletions src/fonduer/parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,10 +276,15 @@ def _parse_table(self, node, state):
stable_id = "{}::{}:{}".format(
state["document"].name, "table", state["table"]["idx"]
)

# Set name for Table
name = node.attrib["name"] if "name" in node.attrib else None

# Create the Table in the DB
parts = {}
parts["document"] = state["document"]
parts["stable_id"] = stable_id
parts["name"] = name
parts["position"] = table_idx
parent = state["parent"][node]
if isinstance(parent, Cell):
Expand Down Expand Up @@ -342,9 +347,13 @@ def _parse_table(self, node, state):
):
state["table"][state["parent"][node].position]["grid"][(r, c)] = 1

# Set name for Cell
name = node.attrib["name"] if "name" in node.attrib else None

# construct cell
parts = defaultdict(list)
parts["document"] = state["document"]
parts["name"] = name
parts["table"] = state["parent"][node]
parts["row_start"] = row_start
parts["row_end"] = row_end
Expand Down Expand Up @@ -380,11 +389,14 @@ def _parse_figure(self, node, state):
if node.tag not in ["img", "figure"]:
return state

# Process the figure
# Process the Figure
stable_id = "{}::{}:{}".format(
state["document"].name, "figure", state["figure"]["idx"]
)

# Set name for Figure
name = node.attrib["name"] if "name" in node.attrib else None

# img within a Figure gets processed in the parent Figure
if node.tag == "img" and isinstance(state["parent"][node], Figure):
return state
Expand All @@ -403,6 +415,7 @@ def _parse_figure(self, node, state):

parts["document"] = state["document"]
parts["stable_id"] = stable_id
parts["name"] = name
parts["position"] = state["figure"]["idx"]

# If processing a raw img
Expand Down Expand Up @@ -443,6 +456,10 @@ def _parse_sentence(self, paragraph, node, state):
"""
text = state["paragraph"]["text"]
field = state["paragraph"]["field"]

# Set name for Sentence
name = node.attrib["name"] if "name" in node.attrib else None

# Lingual Parse
document = state["document"]
for parts in self.tokenize_and_split_sentences(document, text):
Expand All @@ -460,6 +477,7 @@ def _parse_sentence(self, paragraph, node, state):
state["sentence"]["abs_offset"],
abs_sentence_offset_end,
)
parts["name"] = name
state["sentence"]["abs_offset"] = abs_sentence_offset_end
if self.structural:
context_node = node.getparent() if field == "tail" else node
Expand Down Expand Up @@ -549,6 +567,8 @@ def _parse_paragraph(self, node, state):
if node in state["context"]
else state["parent"][node]
)
# Set name for Paragraph
name = node.attrib["name"] if "name" in node.attrib else None

for field in ["text", "tail"]:
text = getattr(node, field)
Expand All @@ -568,6 +588,7 @@ def _parse_paragraph(self, node, state):
)
parts = {}
parts["stable_id"] = stable_id
parts["name"] = name
parts["document"] = state["document"]
parts["position"] = state["paragraph"]["idx"]
if isinstance(parent, Caption):
Expand Down Expand Up @@ -603,22 +624,27 @@ def _parse_paragraph(self, node, state):
def _parse_section(self, node, state):
"""Parse a Section of the node.

Note that this implementation currently just creates a single Section
for a document.
Note that this implementation creates one Section for the root ``html``
node of the document and one Section for each ``section`` tag it visits.

:param node: The lxml node to parse
:param state: The global state necessary to place the node in context
of the document as a whole.
"""
if node.tag != "html":
if node.tag not in ["html", "section"]:
return state

# Add a Section
stable_id = "{}::{}:{}".format(
state["document"].name, "section", state["section"]["idx"]
)

# Set name for Section
name = node.attrib["name"] if "name" in node.attrib else None

state["context"][node] = Section(
document=state["document"],
name=name,
stable_id=stable_id,
position=state["section"]["idx"],
)
Expand All @@ -641,12 +667,17 @@ def _parse_caption(self, node, state):
stable_id = "{}::{}:{}".format(
state["document"].name, "caption", state["caption"]["idx"]
)

# Set name for Caption
name = node.attrib["name"] if "name" in node.attrib else None

if isinstance(parent, Table):
state["context"][node] = Caption(
document=state["document"],
table=parent,
figure=None,
stable_id=stable_id,
name=name,
position=state["caption"]["idx"],
)
elif isinstance(parent, Figure):
Expand All @@ -655,6 +686,7 @@ def _parse_caption(self, node, state):
table=None,
figure=parent,
stable_id=stable_id,
name=name,
position=state["caption"]["idx"],
)
else:
Expand Down
14 changes: 14 additions & 0 deletions tests/data/pure_html/radiology.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<html>
<section name='label'>
<p name='label'>0</p>
</section>
<section name='content'>
<p name='COMPARISON'>None.</p>
<p name='INDICATION'>Positive TB test</p>
<p name='FINDINGS'>The cardiac silhouette and mediastinum size are within normal limits. There is no pulmonary edema. There is no focal consolidation. There are no XXXX of a pleural effusion. There is no evidence of pneumothorax.</p> <p name='IMPRESSION'>Normal chest x-XXXX.</p>
</section>
<section name='image'>
<img src='CXR1_1_IM-0001-3001.png'/>
<img src='/Users/senwu/Desktop/untitled folder/deleteme/test/test/debug/mmm/data/image/CXR1_1_IM-0001-4001.png'/>
</section>
</html>
30 changes: 30 additions & 0 deletions tests/parser/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -539,3 +539,33 @@ def test_parse_error_doc_skipping(caplog):
parser_udf = get_parser_udf(structural=True, lingual=True)
sentence_lists = [x for x in parser_udf.apply(doc)]
assert len(sentence_lists) == 37


def test_parse_multi_sections(caplog):
    """Parse the multi-section radiology HTML and verify names and counts.

    The document is run through the parser UDF with structural, tabular and
    lingual parsing enabled (visual disabled), after which the number of
    Sections, Paragraphs, Sentences and Figures attached to the document, and
    the ``name`` recorded on each Section and on the Paragraphs of the
    ``content`` Section, are checked.
    """
    caplog.set_level(logging.INFO)

    # Load the multi-section HTML fixture as a single document.
    html_path = "tests/data/pure_html/radiology.html"
    radiology = next(
        HTMLDocPreprocessor(html_path)._parse_file(html_path, "radiology")
    )

    udf = get_parser_udf(structural=True, tabular=True, lingual=True, visual=False)
    # Only the side effects on the document matter; drain the results.
    list(udf.apply(radiology))

    # Context counts attached to the document after parsing.
    expected_counts = (
        ("sections", 4),
        ("paragraphs", 5),
        ("sentences", 9),
        ("figures", 2),
    )
    for attribute, count in expected_counts:
        assert len(getattr(radiology, attribute)) == count

    # The first Section has no name; the remaining ones carry the value of
    # the ``name`` attribute of their ``section`` tag.
    sections = radiology.sections
    assert sections[0].name is None
    for index, section_name in enumerate(("label", "content", "image"), start=1):
        assert sections[index].name == section_name

    # Paragraphs of the ``content`` Section keep their ``name`` attribute.
    content_paragraphs = sections[2].paragraphs
    paragraph_names = ("COMPARISON", "INDICATION", "FINDINGS", "IMPRESSION")
    for index, paragraph_name in enumerate(paragraph_names):
        assert content_paragraphs[index].name == paragraph_name