HazyResearch · lukehsiao · Nov 26, 2018 · Nov 25, 2018 · Nov 25, 2018 · Nov 26, 2018
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -1,6 +1,13 @@
 [Unreleased]
 ------------
 
+Added
+^^^^^^^
+* `@senwu`_: Add support to parse multiple sections in parser, fix webpage context, and
+  add name column for each context in data model.
+  (`#175 <https://github.com/HazyResearch/fonduer/pull/175>`_)
+
+
 [0.3.6] - 2018-11-15
 --------------------
 

diff --git a/src/fonduer/parser/models/caption.py b/src/fonduer/parser/models/caption.py
@@ -1,4 +1,4 @@
-from sqlalchemy import Column, ForeignKey, Integer, UniqueConstraint
+from sqlalchemy import Column, ForeignKey, Integer, String, UniqueConstraint
 from sqlalchemy.orm import backref, relationship
 
 from fonduer.parser.models.context import Context
@@ -18,6 +18,9 @@ class Caption(Context):
     #: The position of the ``Caption`` in the ``Document``.
     position = Column(Integer, nullable=False)
 
+    #: The name of a ``Caption``.
+    name = Column(String, unique=False, nullable=True)
+
     #: The id of the parent ``Document``.
     document_id = Column(Integer, ForeignKey("document.id"))
     #: The parent ``Document``.

diff --git a/src/fonduer/parser/models/figure.py b/src/fonduer/parser/models/figure.py
@@ -19,6 +19,9 @@ class Figure(Context):
     #: The position of the ``Figure`` in the ``Document``.
     position = Column(Integer, nullable=False)
 
+    #: The name of a ``Figure``.
+    name = Column(String, unique=False, nullable=True)
+
     #: The id of the parent ``Document``.
     document_id = Column(Integer, ForeignKey("document.id", ondelete="CASCADE"))
     #: The parent ``Document``.

diff --git a/src/fonduer/parser/models/paragraph.py b/src/fonduer/parser/models/paragraph.py
@@ -1,4 +1,4 @@
-from sqlalchemy import Column, ForeignKey, Integer, UniqueConstraint
+from sqlalchemy import Column, ForeignKey, Integer, String, UniqueConstraint
 from sqlalchemy.orm import backref, relationship
 
 from fonduer.parser.models.context import Context
@@ -18,6 +18,9 @@ class Paragraph(Context):
     #: The position of the ``Paragraph`` in the ``Document``.
     position = Column(Integer, nullable=False)
 
+    #: The name of a ``Paragraph``.
+    name = Column(String, unique=False, nullable=True)
+
     #: The id of the parent ``Document``.
     document_id = Column(Integer, ForeignKey("document.id"))
     #: The parent ``Document``.

diff --git a/src/fonduer/parser/models/section.py b/src/fonduer/parser/models/section.py
@@ -1,4 +1,4 @@
-from sqlalchemy import Column, ForeignKey, Integer, UniqueConstraint
+from sqlalchemy import Column, ForeignKey, Integer, String, UniqueConstraint
 from sqlalchemy.orm import backref, relationship
 
 from fonduer.parser.models.context import Context
@@ -17,6 +17,9 @@ class Section(Context):
     #: The unique id of the ``Section``.
     id = Column(Integer, ForeignKey("context.id", ondelete="CASCADE"), primary_key=True)
 
+    #: The name of a ``Section``.
+    name = Column(String, unique=False, nullable=True)
+
     #: The position of the ``Section`` in a ``Document``.
     position = Column(Integer, nullable=False)
 

diff --git a/src/fonduer/parser/models/sentence.py b/src/fonduer/parser/models/sentence.py
@@ -261,6 +261,9 @@ class Sentence(
     #: The position of the ``Sentence`` in the ``Document``.
     position = Column(Integer, nullable=False)  # unique sentence number per document
 
+    #: The name of a ``Sentence``.
+    name = Column(String, unique=False, nullable=True)
+
     #: The id of the parent ``Document``.
     document_id = Column(Integer, ForeignKey("document.id"))
 

diff --git a/src/fonduer/parser/models/table.py b/src/fonduer/parser/models/table.py
@@ -1,4 +1,4 @@
-from sqlalchemy import Column, ForeignKey, Integer, UniqueConstraint
+from sqlalchemy import Column, ForeignKey, Integer, String, UniqueConstraint
 from sqlalchemy.orm import backref, relationship
 
 from fonduer.parser.models.context import Context
@@ -18,6 +18,9 @@ class Table(Context):
     #: The position of the ``Table`` in the ``Document``.
     position = Column(Integer, nullable=False)
 
+    #: The name of a ``Table``.
+    name = Column(String, unique=False, nullable=True)
+
     #: The id of the parent ``Document``.
     document_id = Column(Integer, ForeignKey("document.id"))
     #: The parent ``Document``.
@@ -64,6 +67,9 @@ class Cell(Context):
     #: The position of the ``Cell`` in the ``Table``.
     position = Column(Integer, nullable=False)
 
+    #: The name of a ``Cell``.
+    name = Column(String, unique=False, nullable=True)
+
     #: The id of the parent ``Table``.
     table_id = Column(Integer, ForeignKey("table.id"))
     #: The parent ``Table``.

diff --git a/src/fonduer/parser/models/webpage.py b/src/fonduer/parser/models/webpage.py
@@ -1,17 +1,19 @@
 from sqlalchemy import Column, ForeignKey, Integer, String
 
-from fonduer.parser.models.document import Document
+from fonduer.parser.models.context import Context
 
 
-class Webpage(Document):
-    """A Webpage document context enhanced with additional metadata."""
+class Webpage(Context):
+    """A Webpage Context enhanced with additional metadata."""
 
     __tablename__ = "webpage"
 
     #: The unique id of the ``Webpage``.
-    id = Column(
-        Integer, ForeignKey("document.id", ondelete="CASCADE"), primary_key=True
-    )
+    id = Column(Integer, ForeignKey("context.id", ondelete="CASCADE"), primary_key=True)
+
+    #: The name of a ``Webpage``.
+    name = Column(String, unique=False, nullable=True)
+
     #: The URL of the ``Webpage``.
     url = Column(String)
     #: The host of the ``Webpage``.

diff --git a/src/fonduer/parser/parser.py b/src/fonduer/parser/parser.py
@@ -276,10 +276,15 @@ def _parse_table(self, node, state):
             stable_id = "{}::{}:{}".format(
                 state["document"].name, "table", state["table"]["idx"]
             )
+
+            # Set name for Table
+            name = node.attrib["name"] if "name" in node.attrib else None
+
             # Create the Table in the DB
             parts = {}
             parts["document"] = state["document"]
             parts["stable_id"] = stable_id
+            parts["name"] = name
             parts["position"] = table_idx
             parent = state["parent"][node]
             if isinstance(parent, Cell):
@@ -342,9 +347,13 @@ def _parse_table(self, node, state):
             ):
                 state["table"][state["parent"][node].position]["grid"][(r, c)] = 1
 
+            # Set name for Cell
+            name = node.attrib["name"] if "name" in node.attrib else None
+
             # construct cell
             parts = defaultdict(list)
             parts["document"] = state["document"]
+            parts["name"] = name
             parts["table"] = state["parent"][node]
             parts["row_start"] = row_start
             parts["row_end"] = row_end
@@ -380,11 +389,14 @@ def _parse_figure(self, node, state):
         if node.tag not in ["img", "figure"]:
             return state
 
-        # Process the figure
+        # Process the Figure
         stable_id = "{}::{}:{}".format(
             state["document"].name, "figure", state["figure"]["idx"]
         )
 
+        # Set name for Figure
+        name = node.attrib["name"] if "name" in node.attrib else None
+
         # img within a Figure get's processed in the parent Figure
         if node.tag == "img" and isinstance(state["parent"][node], Figure):
             return state
@@ -403,6 +415,7 @@ def _parse_figure(self, node, state):
 
         parts["document"] = state["document"]
         parts["stable_id"] = stable_id
+        parts["name"] = name
         parts["position"] = state["figure"]["idx"]
 
         # If processing a raw img
@@ -443,6 +456,10 @@ def _parse_sentence(self, paragraph, node, state):
         """
         text = state["paragraph"]["text"]
         field = state["paragraph"]["field"]
+
+        # Set name for Sentence
+        name = node.attrib["name"] if "name" in node.attrib else None
+
         # Lingual Parse
         document = state["document"]
         for parts in self.tokenize_and_split_sentences(document, text):
@@ -460,6 +477,7 @@ def _parse_sentence(self, paragraph, node, state):
                 state["sentence"]["abs_offset"],
                 abs_sentence_offset_end,
             )
+            parts["name"] = name
             state["sentence"]["abs_offset"] = abs_sentence_offset_end
             if self.structural:
                 context_node = node.getparent() if field == "tail" else node
@@ -549,6 +567,8 @@ def _parse_paragraph(self, node, state):
             if node in state["context"]
             else state["parent"][node]
         )
+        # Set name for Paragraph
+        name = node.attrib["name"] if "name" in node.attrib else None
 
         for field in ["text", "tail"]:
             text = getattr(node, field)
@@ -568,6 +588,7 @@ def _parse_paragraph(self, node, state):
             )
             parts = {}
             parts["stable_id"] = stable_id
+            parts["name"] = name
             parts["document"] = state["document"]
             parts["position"] = state["paragraph"]["idx"]
             if isinstance(parent, Caption):
@@ -603,22 +624,27 @@ def _parse_paragraph(self, node, state):
     def _parse_section(self, node, state):
         """Parse a Section of the node.
 
-        Note that this implementation currently just creates a single Section
-        for a document.
+        Note that this implementation creates one Section for the ``html`` root
+        node and one additional Section for every ``section`` tag it encounters.
 
         :param node: The lxml node to parse
         :param state: The global state necessary to place the node in context
             of the document as a whole.
         """
-        if node.tag != "html":
+        if node.tag not in ["html", "section"]:
             return state
 
         # Add a Section
         stable_id = "{}::{}:{}".format(
             state["document"].name, "section", state["section"]["idx"]
         )
+
+        # Set name for Section
+        name = node.attrib["name"] if "name" in node.attrib else None
+
         state["context"][node] = Section(
             document=state["document"],
+            name=name,
             stable_id=stable_id,
             position=state["section"]["idx"],
         )
@@ -641,12 +667,17 @@ def _parse_caption(self, node, state):
         stable_id = "{}::{}:{}".format(
             state["document"].name, "caption", state["caption"]["idx"]
         )
+
+        # Set name for Caption
+        name = node.attrib["name"] if "name" in node.attrib else None
+
         if isinstance(parent, Table):
             state["context"][node] = Caption(
                 document=state["document"],
                 table=parent,
                 figure=None,
                 stable_id=stable_id,
+                name=name,
                 position=state["caption"]["idx"],
             )
         elif isinstance(parent, Figure):
@@ -655,6 +686,7 @@ def _parse_caption(self, node, state):
                 table=None,
                 figure=parent,
                 stable_id=stable_id,
+                name=name,
                 position=state["caption"]["idx"],
             )
         else:

diff --git a/tests/data/pure_html/radiology.html b/tests/data/pure_html/radiology.html
@@ -0,0 +1,14 @@
+<html>
+    <section name='label'>
+        <p name='label'>0</p>
+    </section>
+    <section name='content'>
+        <p name='COMPARISON'>None.</p>
+        <p name='INDICATION'>Positive TB test</p>
+        <p name='FINDINGS'>The cardiac silhouette and mediastinum size are within normal limits. There is no pulmonary edema. There is no focal consolidation. There are no XXXX of a pleural effusion. There is no evidence of pneumothorax.</p> <p name='IMPRESSION'>Normal chest x-XXXX.</p>
+    </section>
+    <section name='image'>
+        <img src='CXR1_1_IM-0001-3001.png'/>
+        <img src='/Users/senwu/Desktop/untitled folder/deleteme/test/test/debug/mmm/data/image/CXR1_1_IM-0001-4001.png'/>
+    </section>
+</html>
diff --git a/tests/parser/test_parser.py b/tests/parser/test_parser.py
@@ -539,3 +539,33 @@ def test_parse_error_doc_skipping(caplog):
     parser_udf = get_parser_udf(structural=True, lingual=True)
     sentence_lists = [x for x in parser_udf.apply(doc)]
     assert len(sentence_lists) == 37
+
+
+def test_parse_multi_sections(caplog):
+    """Test that multiple ``section`` tags are parsed into named Sections."""
+    caplog.set_level(logging.INFO)
+
+    # Test multi-section html
+    docs_path = "tests/data/pure_html/radiology.html"
+    preprocessor = HTMLDocPreprocessor(docs_path)
+    doc = next(preprocessor._parse_file(docs_path, "radiology"))
+    parser_udf = get_parser_udf(
+        structural=True, tabular=True, lingual=True, visual=False
+    )
+    for _ in parser_udf.apply(doc):
+        pass
+
+    assert len(doc.sections) == 4
+    assert len(doc.paragraphs) == 5
+    assert len(doc.sentences) == 9
+    assert len(doc.figures) == 2
+
+    assert doc.sections[0].name is None
+    assert doc.sections[1].name == "label"
+    assert doc.sections[2].name == "content"
+    assert doc.sections[3].name == "image"
+
+    assert doc.sections[2].paragraphs[0].name == "COMPARISON"
+    assert doc.sections[2].paragraphs[1].name == "INDICATION"
+    assert doc.sections[2].paragraphs[2].name == "FINDINGS"
+    assert doc.sections[2].paragraphs[3].name == "IMPRESSION"