From 9bd018e6e72c12373ee37499454eda64cc258727 Mon Sep 17 00:00:00 2001
From: Sen Wu <senwu@cs.stanford.edu>
Date: Sun, 25 Nov 2018 15:04:47 -0800
Subject: [PATCH 1/6] support multiple sections

---
 src/fonduer/parser/parser.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/fonduer/parser/parser.py b/src/fonduer/parser/parser.py
index a6bbb607c..145eaeecc 100644
--- a/src/fonduer/parser/parser.py
+++ b/src/fonduer/parser/parser.py
@@ -603,14 +603,14 @@ def _parse_paragraph(self, node, state):
     def _parse_section(self, node, state):
         """Parse a Section of the node.
 
-        Note that this implementation currently just creates a single Section
-        for a document.
+        Note that this implementation currently creates a Section at the
+        beginning of the document and another for each ``section`` node.
 
         :param node: The lxml node to parse
         :param state: The global state necessary to place the node in context
             of the document as a whole.
         """
-        if node.tag != "html":
+        if node.tag not in ["html", "section"]:
             return state
 
         # Add a Section

From 087ce983af5b30b227b430f025f1e277d63aa92b Mon Sep 17 00:00:00 2001
From: Sen Wu <senwu@cs.stanford.edu>
Date: Sun, 25 Nov 2018 15:09:18 -0800
Subject: [PATCH 2/6] add name column in data model

---
 src/fonduer/parser/models/caption.py   |  5 +++-
 src/fonduer/parser/models/figure.py    |  3 +++
 src/fonduer/parser/models/paragraph.py |  5 +++-
 src/fonduer/parser/models/section.py   |  5 +++-
 src/fonduer/parser/models/sentence.py  |  3 +++
 src/fonduer/parser/models/table.py     |  8 +++++-
 src/fonduer/parser/models/webpage.py   |  4 +++
 src/fonduer/parser/parser.py           | 34 +++++++++++++++++++++++++-
 8 files changed, 62 insertions(+), 5 deletions(-)

diff --git a/src/fonduer/parser/models/caption.py b/src/fonduer/parser/models/caption.py
index aab1aaa17..9750b82bd 100644
--- a/src/fonduer/parser/models/caption.py
+++ b/src/fonduer/parser/models/caption.py
@@ -1,4 +1,4 @@
-from sqlalchemy import Column, ForeignKey, Integer, UniqueConstraint
+from sqlalchemy import Column, ForeignKey, Integer, String, UniqueConstraint
 from sqlalchemy.orm import backref, relationship
 
 from fonduer.parser.models.context import Context
@@ -18,6 +18,9 @@ class Caption(Context):
     #: The position of the ``Caption`` in the ``Document``.
     position = Column(Integer, nullable=False)
 
+    #: The name of a ``Caption``.
+    name = Column(String, unique=False, nullable=True)
+
     #: The id of the parent ``Document``.
     document_id = Column(Integer, ForeignKey("document.id"))
     #: The parent ``Document``.
diff --git a/src/fonduer/parser/models/figure.py b/src/fonduer/parser/models/figure.py
index 416ea02c2..a0981c890 100644
--- a/src/fonduer/parser/models/figure.py
+++ b/src/fonduer/parser/models/figure.py
@@ -19,6 +19,9 @@ class Figure(Context):
     #: The position of the ``Figure`` in the ``Document``.
     position = Column(Integer, nullable=False)
 
+    #: The name of a ``Figure``.
+    name = Column(String, unique=False, nullable=True)
+
     #: The id of the parent ``Document``.
     document_id = Column(Integer, ForeignKey("document.id", ondelete="CASCADE"))
     #: The parent ``Document``.
diff --git a/src/fonduer/parser/models/paragraph.py b/src/fonduer/parser/models/paragraph.py
index d5ffff047..8c5b286f2 100644
--- a/src/fonduer/parser/models/paragraph.py
+++ b/src/fonduer/parser/models/paragraph.py
@@ -1,4 +1,4 @@
-from sqlalchemy import Column, ForeignKey, Integer, UniqueConstraint
+from sqlalchemy import Column, ForeignKey, Integer, String, UniqueConstraint
 from sqlalchemy.orm import backref, relationship
 
 from fonduer.parser.models.context import Context
@@ -18,6 +18,9 @@ class Paragraph(Context):
     #: The position of the ``Paragraph`` in the ``Document``.
     position = Column(Integer, nullable=False)
 
+    #: The name of a ``Paragraph``.
+    name = Column(String, unique=False, nullable=True)
+
     #: The id of the parent ``Document``.
     document_id = Column(Integer, ForeignKey("document.id"))
     #: The parent ``Document``.
diff --git a/src/fonduer/parser/models/section.py b/src/fonduer/parser/models/section.py
index d280f4c99..bff0f9d1b 100644
--- a/src/fonduer/parser/models/section.py
+++ b/src/fonduer/parser/models/section.py
@@ -1,4 +1,4 @@
-from sqlalchemy import Column, ForeignKey, Integer, UniqueConstraint
+from sqlalchemy import Column, ForeignKey, Integer, String, UniqueConstraint
 from sqlalchemy.orm import backref, relationship
 
 from fonduer.parser.models.context import Context
@@ -17,6 +17,9 @@ class Section(Context):
     #: The unique id of the ``Section``.
     id = Column(Integer, ForeignKey("context.id", ondelete="CASCADE"), primary_key=True)
 
+    #: The name of a ``Section``.
+    name = Column(String, unique=False, nullable=True)
+
     #: The position of the ``Section`` in a ``Document``.
     position = Column(Integer, nullable=False)
 
diff --git a/src/fonduer/parser/models/sentence.py b/src/fonduer/parser/models/sentence.py
index e516e41fd..562ca4914 100644
--- a/src/fonduer/parser/models/sentence.py
+++ b/src/fonduer/parser/models/sentence.py
@@ -261,6 +261,9 @@ class Sentence(
     #: The position of the ``Sentence`` in the ``Document``.
     position = Column(Integer, nullable=False)  # unique sentence number per document
 
+    #: The name of a ``Sentence``.
+    name = Column(String, unique=False, nullable=True)
+
     #: The id of the parent ``Document``.
     document_id = Column(Integer, ForeignKey("document.id"))
 
diff --git a/src/fonduer/parser/models/table.py b/src/fonduer/parser/models/table.py
index 10aed30f7..91b91c650 100644
--- a/src/fonduer/parser/models/table.py
+++ b/src/fonduer/parser/models/table.py
@@ -1,4 +1,4 @@
-from sqlalchemy import Column, ForeignKey, Integer, UniqueConstraint
+from sqlalchemy import Column, ForeignKey, Integer, String, UniqueConstraint
 from sqlalchemy.orm import backref, relationship
 
 from fonduer.parser.models.context import Context
@@ -18,6 +18,9 @@ class Table(Context):
     #: The position of the ``Table`` in the ``Document``.
     position = Column(Integer, nullable=False)
 
+    #: The name of a ``Table``.
+    name = Column(String, unique=False, nullable=True)
+
     #: The id of the parent ``Document``.
     document_id = Column(Integer, ForeignKey("document.id"))
     #: The parent ``Document``.
@@ -64,6 +67,9 @@ class Cell(Context):
     #: The position of the ``Cell`` in the ``Table``.
     position = Column(Integer, nullable=False)
 
+    #: The name of a ``Cell``.
+    name = Column(String, unique=False, nullable=True)
+
     #: The id of the parent ``Table``.
     table_id = Column(Integer, ForeignKey("table.id"))
     #: The parent ``Table``.
diff --git a/src/fonduer/parser/models/webpage.py b/src/fonduer/parser/models/webpage.py
index b4e35bfd5..f878112dd 100644
--- a/src/fonduer/parser/models/webpage.py
+++ b/src/fonduer/parser/models/webpage.py
@@ -12,6 +12,10 @@ class Webpage(Document):
     id = Column(
         Integer, ForeignKey("document.id", ondelete="CASCADE"), primary_key=True
     )
+
+    #: The name of a ``Webpage``.
+    name = Column(String, unique=False, nullable=True)
+
     #: The URL of the ``Webpage``.
     url = Column(String)
     #: The host of the ``Webpage``.
diff --git a/src/fonduer/parser/parser.py b/src/fonduer/parser/parser.py
index 145eaeecc..f0ec91539 100644
--- a/src/fonduer/parser/parser.py
+++ b/src/fonduer/parser/parser.py
@@ -276,10 +276,15 @@ def _parse_table(self, node, state):
             stable_id = "{}::{}:{}".format(
                 state["document"].name, "table", state["table"]["idx"]
             )
+
+            # Set name for Table
+            name = node.attrib["name"] if "name" in node.attrib else None
+
             # Create the Table in the DB
             parts = {}
             parts["document"] = state["document"]
             parts["stable_id"] = stable_id
+            parts["name"] = name
             parts["position"] = table_idx
             parent = state["parent"][node]
             if isinstance(parent, Cell):
@@ -342,9 +347,13 @@ def _parse_table(self, node, state):
             ):
                 state["table"][state["parent"][node].position]["grid"][(r, c)] = 1
 
+            # Set name for Cell
+            name = node.attrib["name"] if "name" in node.attrib else None
+
             # construct cell
             parts = defaultdict(list)
             parts["document"] = state["document"]
+            parts["name"] = name
             parts["table"] = state["parent"][node]
             parts["row_start"] = row_start
             parts["row_end"] = row_end
@@ -380,11 +389,14 @@ def _parse_figure(self, node, state):
         if node.tag not in ["img", "figure"]:
             return state
 
-        # Process the figure
+        # Process the Figure
         stable_id = "{}::{}:{}".format(
             state["document"].name, "figure", state["figure"]["idx"]
         )
 
+        # Set name for Figure
+        name = node.attrib["name"] if "name" in node.attrib else None
+
         # img within a Figure get's processed in the parent Figure
         if node.tag == "img" and isinstance(state["parent"][node], Figure):
             return state
@@ -403,6 +415,7 @@ def _parse_figure(self, node, state):
 
         parts["document"] = state["document"]
         parts["stable_id"] = stable_id
+        parts["name"] = name
         parts["position"] = state["figure"]["idx"]
 
         # If processing a raw img
@@ -443,6 +456,10 @@ def _parse_sentence(self, paragraph, node, state):
         """
         text = state["paragraph"]["text"]
         field = state["paragraph"]["field"]
+
+        # Set name for Sentence
+        name = node.attrib["name"] if "name" in node.attrib else None
+
         # Lingual Parse
         document = state["document"]
         for parts in self.tokenize_and_split_sentences(document, text):
@@ -460,6 +477,7 @@ def _parse_sentence(self, paragraph, node, state):
                 state["sentence"]["abs_offset"],
                 abs_sentence_offset_end,
             )
+            parts["name"] = name
             state["sentence"]["abs_offset"] = abs_sentence_offset_end
             if self.structural:
                 context_node = node.getparent() if field == "tail" else node
@@ -549,6 +567,8 @@ def _parse_paragraph(self, node, state):
             if node in state["context"]
             else state["parent"][node]
         )
+        # Set name for Paragraph
+        name = node.attrib["name"] if "name" in node.attrib else None
 
         for field in ["text", "tail"]:
             text = getattr(node, field)
@@ -568,6 +588,7 @@ def _parse_paragraph(self, node, state):
             )
             parts = {}
             parts["stable_id"] = stable_id
+            parts["name"] = name
             parts["document"] = state["document"]
             parts["position"] = state["paragraph"]["idx"]
             if isinstance(parent, Caption):
@@ -617,8 +638,13 @@ def _parse_section(self, node, state):
         stable_id = "{}::{}:{}".format(
             state["document"].name, "section", state["section"]["idx"]
         )
+
+        # Set name for Section
+        name = node.attrib["name"] if "name" in node.attrib else None
+
         state["context"][node] = Section(
             document=state["document"],
+            name=name,
             stable_id=stable_id,
             position=state["section"]["idx"],
         )
@@ -641,12 +667,17 @@ def _parse_caption(self, node, state):
         stable_id = "{}::{}:{}".format(
             state["document"].name, "caption", state["caption"]["idx"]
         )
+
+        # Set name for Caption
+        name = node.attrib["name"] if "name" in node.attrib else None
+
         if isinstance(parent, Table):
             state["context"][node] = Caption(
                 document=state["document"],
                 table=parent,
                 figure=None,
                 stable_id=stable_id,
+                name=name,
                 position=state["caption"]["idx"],
             )
         elif isinstance(parent, Figure):
@@ -655,6 +686,7 @@ def _parse_caption(self, node, state):
                 table=None,
                 figure=parent,
                 stable_id=stable_id,
+                name=name,
                 position=state["caption"]["idx"],
             )
         else:

From 4b63fce6821b5015852e5bf66c5faf5eb9829262 Mon Sep 17 00:00:00 2001
From: Sen Wu <senwu@cs.stanford.edu>
Date: Sun, 25 Nov 2018 17:27:45 -0800
Subject: [PATCH 3/6] update webpage

---
 src/fonduer/parser/models/webpage.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/fonduer/parser/models/webpage.py b/src/fonduer/parser/models/webpage.py
index f878112dd..5160602ac 100644
--- a/src/fonduer/parser/models/webpage.py
+++ b/src/fonduer/parser/models/webpage.py
@@ -1,17 +1,15 @@
 from sqlalchemy import Column, ForeignKey, Integer, String
 
-from fonduer.parser.models.document import Document
+from fonduer.parser.models.context import Context
 
 
-class Webpage(Document):
-    """A Webpage document context enhanced with additional metadata."""
+class Webpage(Context):
+    """A Webpage Context enhanced with additional metadata."""
 
     __tablename__ = "webpage"
 
     #: The unique id of the ``Webpage``.
-    id = Column(
-        Integer, ForeignKey("document.id", ondelete="CASCADE"), primary_key=True
-    )
+    id = Column(Integer, ForeignKey("context.id", ondelete="CASCADE"), primary_key=True)
 
     #: The name of a ``Webpage``.
     name = Column(String, unique=False, nullable=True)

From 25d15a4bb131f57aec3a310f89ba55e0fed5485f Mon Sep 17 00:00:00 2001
From: Sen Wu <senwu@cs.stanford.edu>
Date: Sun, 25 Nov 2018 17:28:31 -0800
Subject: [PATCH 4/6] add section test

---
 tests/data/pure_html/radiology.html | 14 ++++++++++++++
 tests/parser/test_parser.py         | 30 +++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+)
 create mode 100644 tests/data/pure_html/radiology.html

diff --git a/tests/data/pure_html/radiology.html b/tests/data/pure_html/radiology.html
new file mode 100644
index 000000000..1ec632949
--- /dev/null
+++ b/tests/data/pure_html/radiology.html
@@ -0,0 +1,14 @@
+<html>
+    <section name='label'>
+        <p name='label'>0</p>
+    </section>
+    <section name='content'>
+        <p name='COMPARISON'>None.</p>
+        <p name='INDICATION'>Positive TB test</p>
+        <p name='FINDINGS'>The cardiac silhouette and mediastinum size are within normal limits. There is no pulmonary edema. There is no focal consolidation. There are no XXXX of a pleural effusion. There is no evidence of pneumothorax.</p> <p name='IMPRESSION'>Normal chest x-XXXX.</p>
+    </section>
+    <section name='image'>
+        <img src='CXR1_1_IM-0001-3001.png'/>
+        <img src='/Users/senwu/Desktop/untitled folder/deleteme/test/test/debug/mmm/data/image/CXR1_1_IM-0001-4001.png'/>
+    </section>
+</html>
\ No newline at end of file
diff --git a/tests/parser/test_parser.py b/tests/parser/test_parser.py
index 28c1bcaca..2fa7b0339 100644
--- a/tests/parser/test_parser.py
+++ b/tests/parser/test_parser.py
@@ -539,3 +539,33 @@ def test_parse_error_doc_skipping(caplog):
     parser_udf = get_parser_udf(structural=True, lingual=True)
     sentence_lists = [x for x in parser_udf.apply(doc)]
     assert len(sentence_lists) == 37
+
+
+def test_parse_multi_sections(caplog):
+    """Test the parser with the radiology document."""
+    caplog.set_level(logging.INFO)
+
+    # Test multi-section html
+    docs_path = "tests/data/pure_html/radiology.html"
+    preprocessor = HTMLDocPreprocessor(docs_path)
+    doc = next(preprocessor._parse_file(docs_path, "radiology"))
+    parser_udf = get_parser_udf(
+        structural=True, tabular=True, lingual=True, visual=False
+    )
+    for _ in parser_udf.apply(doc):
+        pass
+
+    assert len(doc.sections) == 4
+    assert len(doc.paragraphs) == 5
+    assert len(doc.sentences) == 9
+    assert len(doc.figures) == 2
+
+    assert doc.sections[0].name is None
+    assert doc.sections[1].name == "label"
+    assert doc.sections[2].name == "content"
+    assert doc.sections[3].name == "image"
+
+    assert doc.sections[2].paragraphs[0].name == "COMPARISON"
+    assert doc.sections[2].paragraphs[1].name == "INDICATION"
+    assert doc.sections[2].paragraphs[2].name == "FINDINGS"
+    assert doc.sections[2].paragraphs[3].name == "IMPRESSION"

From c942ef2ac17804dbaf7c4b78007266e90ecc3197 Mon Sep 17 00:00:00 2001
From: Sen Wu <senwu@cs.stanford.edu>
Date: Sun, 25 Nov 2018 23:36:17 -0800
Subject: [PATCH 5/6] update CHANGELOG

---
 CHANGELOG.rst | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index c02c60edf..f82f14d65 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -1,6 +1,13 @@
 [Unreleased]
 ------------
 
+Added
+^^^^^^^
+* `@senwu`_: Add support to parse multiple sections in parser, fix webpage context, and
+  add name column for each context in data model.
+  (`#175 <https://github.com/HazyResearch/fonduer/pull/175>`_)
+
+
 [0.3.6] - 2018-11-15
 --------------------
 

From 5041c26761af3b72e00bc85159f6b2d7b6dc60dd Mon Sep 17 00:00:00 2001
From: Luke Hsiao <lwhsiao@stanford.edu>
Date: Mon, 26 Nov 2018 09:59:16 -0800
Subject: [PATCH 6/6] Update CHANGELOG and version

---
 CHANGELOG.rst           | 9 +++++++--
 src/fonduer/_version.py | 2 +-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index f82f14d65..77dc8aae1 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -2,10 +2,15 @@
 ------------
 
 Added
-^^^^^^^
+^^^^^
 * `@senwu`_: Add support to parse multiple sections in parser, fix webpage context, and
   add name column for each context in data model.
-  (`#175 <https://github.com/HazyResearch/fonduer/pull/175>`_)
+  (`#182 <https://github.com/HazyResearch/fonduer/pull/182>`_)
+
+Fixed
+^^^^^
+* `@j-rausch`_: Improve error handling for invalid row spans.
+  (`#183 <https://github.com/HazyResearch/fonduer/pull/183>`_)
 
 
 [0.3.6] - 2018-11-15
diff --git a/src/fonduer/_version.py b/src/fonduer/_version.py
index d7b30e121..8879c6c77 100644
--- a/src/fonduer/_version.py
+++ b/src/fonduer/_version.py
@@ -1 +1 @@
-__version__ = "0.3.6"
+__version__ = "0.3.7"