From 9bd018e6e72c12373ee37499454eda64cc258727 Mon Sep 17 00:00:00 2001 From: Sen Wu Date: Sun, 25 Nov 2018 15:04:47 -0800 Subject: [PATCH 1/6] support multiple sections --- src/fonduer/parser/parser.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/fonduer/parser/parser.py b/src/fonduer/parser/parser.py index a6bbb607c..145eaeecc 100644 --- a/src/fonduer/parser/parser.py +++ b/src/fonduer/parser/parser.py @@ -603,14 +603,14 @@ def _parse_paragraph(self, node, state): def _parse_section(self, node, state): """Parse a Section of the node. - Note that this implementation currently just creates a single Section - for a document. + Note that this implementation currently creates a Section at the + beginning of the document and creates Section based on tag of node. :param node: The lxml node to parse :param state: The global state necessary to place the node in context of the document as a whole. """ - if node.tag != "html": + if node.tag not in ["html", "section"]: return state # Add a Section From 087ce983af5b30b227b430f025f1e277d63aa92b Mon Sep 17 00:00:00 2001 From: Sen Wu Date: Sun, 25 Nov 2018 15:09:18 -0800 Subject: [PATCH 2/6] add name column in data model --- src/fonduer/parser/models/caption.py | 5 +++- src/fonduer/parser/models/figure.py | 3 +++ src/fonduer/parser/models/paragraph.py | 5 +++- src/fonduer/parser/models/section.py | 5 +++- src/fonduer/parser/models/sentence.py | 3 +++ src/fonduer/parser/models/table.py | 8 +++++- src/fonduer/parser/models/webpage.py | 4 +++ src/fonduer/parser/parser.py | 34 +++++++++++++++++++++++++- 8 files changed, 62 insertions(+), 5 deletions(-) diff --git a/src/fonduer/parser/models/caption.py b/src/fonduer/parser/models/caption.py index aab1aaa17..9750b82bd 100644 --- a/src/fonduer/parser/models/caption.py +++ b/src/fonduer/parser/models/caption.py @@ -1,4 +1,4 @@ -from sqlalchemy import Column, ForeignKey, Integer, UniqueConstraint +from sqlalchemy import Column, ForeignKey, Integer, String, 
UniqueConstraint from sqlalchemy.orm import backref, relationship from fonduer.parser.models.context import Context @@ -18,6 +18,9 @@ class Caption(Context): #: The position of the ``Caption`` in the ``Document``. position = Column(Integer, nullable=False) + #: The name of a ``Caption``. + name = Column(String, unique=False, nullable=True) + #: The id of the parent ``Document``. document_id = Column(Integer, ForeignKey("document.id")) #: The parent ``Document``. diff --git a/src/fonduer/parser/models/figure.py b/src/fonduer/parser/models/figure.py index 416ea02c2..a0981c890 100644 --- a/src/fonduer/parser/models/figure.py +++ b/src/fonduer/parser/models/figure.py @@ -19,6 +19,9 @@ class Figure(Context): #: The position of the ``Figure`` in the ``Document``. position = Column(Integer, nullable=False) + #: The name of a ``Figure``. + name = Column(String, unique=False, nullable=True) + #: The id of the parent ``Document``. document_id = Column(Integer, ForeignKey("document.id", ondelete="CASCADE")) #: The parent ``Document``. diff --git a/src/fonduer/parser/models/paragraph.py b/src/fonduer/parser/models/paragraph.py index d5ffff047..8c5b286f2 100644 --- a/src/fonduer/parser/models/paragraph.py +++ b/src/fonduer/parser/models/paragraph.py @@ -1,4 +1,4 @@ -from sqlalchemy import Column, ForeignKey, Integer, UniqueConstraint +from sqlalchemy import Column, ForeignKey, Integer, String, UniqueConstraint from sqlalchemy.orm import backref, relationship from fonduer.parser.models.context import Context @@ -18,6 +18,9 @@ class Paragraph(Context): #: The position of the ``Paragraph`` in the ``Document``. position = Column(Integer, nullable=False) + #: The name of a ``Paragraph``. + name = Column(String, unique=False, nullable=True) + #: The id of the parent ``Document``. document_id = Column(Integer, ForeignKey("document.id")) #: The parent ``Document``. 
diff --git a/src/fonduer/parser/models/section.py b/src/fonduer/parser/models/section.py index d280f4c99..bff0f9d1b 100644 --- a/src/fonduer/parser/models/section.py +++ b/src/fonduer/parser/models/section.py @@ -1,4 +1,4 @@ -from sqlalchemy import Column, ForeignKey, Integer, UniqueConstraint +from sqlalchemy import Column, ForeignKey, Integer, String, UniqueConstraint from sqlalchemy.orm import backref, relationship from fonduer.parser.models.context import Context @@ -17,6 +17,9 @@ class Section(Context): #: The unique id of the ``Section``. id = Column(Integer, ForeignKey("context.id", ondelete="CASCADE"), primary_key=True) + #: The name of a ``Section``. + name = Column(String, unique=False, nullable=True) + #: The position of the ``Section`` in a ``Document``. position = Column(Integer, nullable=False) diff --git a/src/fonduer/parser/models/sentence.py b/src/fonduer/parser/models/sentence.py index e516e41fd..562ca4914 100644 --- a/src/fonduer/parser/models/sentence.py +++ b/src/fonduer/parser/models/sentence.py @@ -261,6 +261,9 @@ class Sentence( #: The position of the ``Sentence`` in the ``Document``. position = Column(Integer, nullable=False) # unique sentence number per document + #: The name of a ``Sentence``. + name = Column(String, unique=False, nullable=True) + #: The id of the parent ``Document``. document_id = Column(Integer, ForeignKey("document.id")) diff --git a/src/fonduer/parser/models/table.py b/src/fonduer/parser/models/table.py index 10aed30f7..91b91c650 100644 --- a/src/fonduer/parser/models/table.py +++ b/src/fonduer/parser/models/table.py @@ -1,4 +1,4 @@ -from sqlalchemy import Column, ForeignKey, Integer, UniqueConstraint +from sqlalchemy import Column, ForeignKey, Integer, String, UniqueConstraint from sqlalchemy.orm import backref, relationship from fonduer.parser.models.context import Context @@ -18,6 +18,9 @@ class Table(Context): #: The position of the ``Table`` in the ``Document``. 
position = Column(Integer, nullable=False) + #: The name of a ``Table``. + name = Column(String, unique=False, nullable=True) + #: The id of the parent ``Document``. document_id = Column(Integer, ForeignKey("document.id")) #: The parent ``Document``. @@ -64,6 +67,9 @@ class Cell(Context): #: The position of the ``Cell`` in the ``Table``. position = Column(Integer, nullable=False) + #: The name of a ``Cell``. + name = Column(String, unique=False, nullable=True) + #: The id of the parent ``Table``. table_id = Column(Integer, ForeignKey("table.id")) #: The parent ``Table``. diff --git a/src/fonduer/parser/models/webpage.py b/src/fonduer/parser/models/webpage.py index b4e35bfd5..f878112dd 100644 --- a/src/fonduer/parser/models/webpage.py +++ b/src/fonduer/parser/models/webpage.py @@ -12,6 +12,10 @@ class Webpage(Document): id = Column( Integer, ForeignKey("document.id", ondelete="CASCADE"), primary_key=True ) + + #: The name of a ``Webpage``. + name = Column(String, unique=False, nullable=True) + #: The URL of the ``Webpage``. url = Column(String) #: The host of the ``Webpage``. 
diff --git a/src/fonduer/parser/parser.py b/src/fonduer/parser/parser.py index 145eaeecc..f0ec91539 100644 --- a/src/fonduer/parser/parser.py +++ b/src/fonduer/parser/parser.py @@ -276,10 +276,15 @@ def _parse_table(self, node, state): stable_id = "{}::{}:{}".format( state["document"].name, "table", state["table"]["idx"] ) + + # Set name for Table + name = node.attrib["name"] if "name" in node.attrib else None + # Create the Table in the DB parts = {} parts["document"] = state["document"] parts["stable_id"] = stable_id + parts["name"] = name parts["position"] = table_idx parent = state["parent"][node] if isinstance(parent, Cell): @@ -342,9 +347,13 @@ def _parse_table(self, node, state): ): state["table"][state["parent"][node].position]["grid"][(r, c)] = 1 + # Set name for Cell + name = node.attrib["name"] if "name" in node.attrib else None + # construct cell parts = defaultdict(list) parts["document"] = state["document"] + parts["name"] = name parts["table"] = state["parent"][node] parts["row_start"] = row_start parts["row_end"] = row_end @@ -380,11 +389,14 @@ def _parse_figure(self, node, state): if node.tag not in ["img", "figure"]: return state - # Process the figure + # Process the Figure stable_id = "{}::{}:{}".format( state["document"].name, "figure", state["figure"]["idx"] ) + # Set name for Figure + name = node.attrib["name"] if "name" in node.attrib else None + # img within a Figure get's processed in the parent Figure if node.tag == "img" and isinstance(state["parent"][node], Figure): return state @@ -403,6 +415,7 @@ def _parse_figure(self, node, state): parts["document"] = state["document"] parts["stable_id"] = stable_id + parts["name"] = name parts["position"] = state["figure"]["idx"] # If processing a raw img @@ -443,6 +456,10 @@ def _parse_sentence(self, paragraph, node, state): """ text = state["paragraph"]["text"] field = state["paragraph"]["field"] + + # Set name for Sentence + name = node.attrib["name"] if "name" in node.attrib else None + # 
Lingual Parse document = state["document"] for parts in self.tokenize_and_split_sentences(document, text): @@ -460,6 +477,7 @@ def _parse_sentence(self, paragraph, node, state): state["sentence"]["abs_offset"], abs_sentence_offset_end, ) + parts["name"] = name state["sentence"]["abs_offset"] = abs_sentence_offset_end if self.structural: context_node = node.getparent() if field == "tail" else node @@ -549,6 +567,8 @@ def _parse_paragraph(self, node, state): if node in state["context"] else state["parent"][node] ) + # Set name for Paragraph + name = node.attrib["name"] if "name" in node.attrib else None for field in ["text", "tail"]: text = getattr(node, field) @@ -568,6 +588,7 @@ def _parse_paragraph(self, node, state): ) parts = {} parts["stable_id"] = stable_id + parts["name"] = name parts["document"] = state["document"] parts["position"] = state["paragraph"]["idx"] if isinstance(parent, Caption): @@ -617,8 +638,13 @@ def _parse_section(self, node, state): stable_id = "{}::{}:{}".format( state["document"].name, "section", state["section"]["idx"] ) + + # Set name for Section + name = node.attrib["name"] if "name" in node.attrib else None + state["context"][node] = Section( document=state["document"], + name=name, stable_id=stable_id, position=state["section"]["idx"], ) @@ -641,12 +667,17 @@ def _parse_caption(self, node, state): stable_id = "{}::{}:{}".format( state["document"].name, "caption", state["caption"]["idx"] ) + + # Set name for Caption + name = node.attrib["name"] if "name" in node.attrib else None + if isinstance(parent, Table): state["context"][node] = Caption( document=state["document"], table=parent, figure=None, stable_id=stable_id, + name=name, position=state["caption"]["idx"], ) elif isinstance(parent, Figure): @@ -655,6 +686,7 @@ def _parse_caption(self, node, state): table=None, figure=parent, stable_id=stable_id, + name=name, position=state["caption"]["idx"], ) else: From 4b63fce6821b5015852e5bf66c5faf5eb9829262 Mon Sep 17 00:00:00 2001 From: 
Sen Wu Date: Sun, 25 Nov 2018 17:27:45 -0800 Subject: [PATCH 3/6] update webpage --- src/fonduer/parser/models/webpage.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/fonduer/parser/models/webpage.py b/src/fonduer/parser/models/webpage.py index f878112dd..5160602ac 100644 --- a/src/fonduer/parser/models/webpage.py +++ b/src/fonduer/parser/models/webpage.py @@ -1,17 +1,15 @@ from sqlalchemy import Column, ForeignKey, Integer, String -from fonduer.parser.models.document import Document +from fonduer.parser.models.context import Context -class Webpage(Document): - """A Webpage document context enhanced with additional metadata.""" +class Webpage(Context): + """A Webpage Context enhanced with additional metadata.""" __tablename__ = "webpage" #: The unique id of the ``Webpage``. - id = Column( - Integer, ForeignKey("document.id", ondelete="CASCADE"), primary_key=True - ) + id = Column(Integer, ForeignKey("context.id", ondelete="CASCADE"), primary_key=True) #: The name of a ``Webpage``. name = Column(String, unique=False, nullable=True) From 25d15a4bb131f57aec3a310f89ba55e0fed5485f Mon Sep 17 00:00:00 2001 From: Sen Wu Date: Sun, 25 Nov 2018 17:28:31 -0800 Subject: [PATCH 4/6] add section test --- tests/data/pure_html/radiology.html | 14 ++++++++++++++ tests/parser/test_parser.py | 30 +++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 tests/data/pure_html/radiology.html diff --git a/tests/data/pure_html/radiology.html b/tests/data/pure_html/radiology.html new file mode 100644 index 000000000..1ec632949 --- /dev/null +++ b/tests/data/pure_html/radiology.html @@ -0,0 +1,14 @@ + +
+

0

+
+
+

None.

+

Positive TB test

+

The cardiac silhouette and mediastinum size are within normal limits. There is no pulmonary edema. There is no focal consolidation. There are no XXXX of a pleural effusion. There is no evidence of pneumothorax.

Normal chest x-XXXX.

+
+
+ + +
+ \ No newline at end of file diff --git a/tests/parser/test_parser.py b/tests/parser/test_parser.py index 28c1bcaca..2fa7b0339 100644 --- a/tests/parser/test_parser.py +++ b/tests/parser/test_parser.py @@ -539,3 +539,33 @@ def test_parse_error_doc_skipping(caplog): parser_udf = get_parser_udf(structural=True, lingual=True) sentence_lists = [x for x in parser_udf.apply(doc)] assert len(sentence_lists) == 37 + + +def test_parse_multi_sections(caplog): + """Test the parser with the radiology document.""" + caplog.set_level(logging.INFO) + + # Test multi-section html + docs_path = "tests/data/pure_html/radiology.html" + preprocessor = HTMLDocPreprocessor(docs_path) + doc = next(preprocessor._parse_file(docs_path, "radiology")) + parser_udf = get_parser_udf( + structural=True, tabular=True, lingual=True, visual=False + ) + for _ in parser_udf.apply(doc): + pass + + assert len(doc.sections) == 4 + assert len(doc.paragraphs) == 5 + assert len(doc.sentences) == 9 + assert len(doc.figures) == 2 + + assert doc.sections[0].name is None + assert doc.sections[1].name == "label" + assert doc.sections[2].name == "content" + assert doc.sections[3].name == "image" + + assert doc.sections[2].paragraphs[0].name == "COMPARISON" + assert doc.sections[2].paragraphs[1].name == "INDICATION" + assert doc.sections[2].paragraphs[2].name == "FINDINGS" + assert doc.sections[2].paragraphs[3].name == "IMPRESSION" From c942ef2ac17804dbaf7c4b78007266e90ecc3197 Mon Sep 17 00:00:00 2001 From: Sen Wu Date: Sun, 25 Nov 2018 23:36:17 -0800 Subject: [PATCH 5/6] update CHANGELOG --- CHANGELOG.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index c02c60edf..f82f14d65 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,13 @@ [Unreleased] ------------ +Added +^^^^^^^ +* `@senwu`_: Add support to parse multiple sections in parser, fix webpage context, and + add name column for each context in data model. 
+ (`#175 <https://github.com/HazyResearch/fonduer/pull/175>`_) + + [0.3.6] - 2018-11-15 -------------------- From 5041c26761af3b72e00bc85159f6b2d7b6dc60dd Mon Sep 17 00:00:00 2001 From: Luke Hsiao Date: Mon, 26 Nov 2018 09:59:16 -0800 Subject: [PATCH 6/6] Update CHANGELOG and version --- CHANGELOG.rst | 9 +++++++-- src/fonduer/_version.py | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f82f14d65..77dc8aae1 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,10 +2,15 @@ ------------ Added -^^^^^^^ +^^^^^ * `@senwu`_: Add support to parse multiple sections in parser, fix webpage context, and add name column for each context in data model. - (`#175 <https://github.com/HazyResearch/fonduer/pull/175>`_) + (`#182 <https://github.com/HazyResearch/fonduer/pull/182>`_) + +Fixed +^^^^^ +* `@j-rausch`_: Improve error handling for invalid row spans. + (`#183 <https://github.com/HazyResearch/fonduer/pull/183>`_) [0.3.6] - 2018-11-15 diff --git a/src/fonduer/_version.py b/src/fonduer/_version.py index d7b30e121..8879c6c77 100644 --- a/src/fonduer/_version.py +++ b/src/fonduer/_version.py @@ -1 +1 @@ -__version__ = "0.3.6" +__version__ = "0.3.7"