Skip to content

Commit

Permalink
Pipeline: bump to pysbd 0.3.0 to incorporate fix for character spans
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewhead committed Aug 11, 2020
1 parent e76ae07 commit 4fae978
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 43 deletions.
26 changes: 6 additions & 20 deletions data-processing/entities/sentences/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,10 +156,7 @@ def parse(self, tex_path: str, tex: str) -> Iterator[Sentence]:
plaintext_to_tex_offset_map[len(plaintext)] = last_segment.tex_end

# Segment the plaintext. Return offsets for each sentence relative to the TeX input
segmenter = pysbd.Segmenter(language="en", clean=False)

# Record the current length of the plain text to account for the extractor bug.
length_so_far_in_plain_text = 0
segmenter = pysbd.Segmenter(language="en", clean=False, char_span=True)

# As each sentence is scanned, keep track of what sections and environments the
# sentence appears within.
Expand All @@ -168,27 +165,25 @@ def parse(self, tex_path: str, tex: str) -> Iterator[Sentence]:
in_table = False
in_itemize = False

for i, sentence in enumerate(segmenter.segment(plaintext)):
for i, span in enumerate(segmenter.segment(plaintext)):
# The pysbd module has several open bugs and issues which are addressed below.
# As of 3/23/20 we know the module will fail in the following ways:
# 1. pysbd will not break up the sentence when it starts with a punctuation mark or space.
# ex: ". hello. world. hi."
# sol: check for sentences being longer than 1000 characters.
# 2. pysbd indexes are sometimes incorrectly set
# ex: "hello. world. 1) item one. 2) item two. 3) item three" or "hello!!! world."
# sol: set indexes manually using string search + sentence length
# 3. pysbd uses reserved characters for splitting sentences
# 2. pysbd uses reserved characters for splitting sentences
# ex: see PYSBD_RESERVED_CHARACTERS list.
# sol: throw a warning if the sentence contains any of these characters.
sentence = span.sent
if len(sentence) > 1000:
logging.warning( # pylint: disable=logging-not-lazy
"Exceptionally long sentence (length %d). This might indicate the sentence "
+ "extractor failed to properly split text into sentences.",
len(sentence),
)

plaintext_start = plaintext.find(sentence, length_so_far_in_plain_text)
plaintext_end = plaintext_start + len(sentence)
plaintext_start = span.start
plaintext_end = span.end
if (
plaintext_start not in plaintext_to_tex_offset_map
or plaintext_end not in plaintext_to_tex_offset_map
Expand All @@ -199,18 +194,9 @@ def parse(self, tex_path: str, tex: str) -> Iterator[Sentence]:
sentence,
)
continue
if plaintext_start - 500 > length_so_far_in_plain_text:
logging.warning( # pylint: disable=logging-not-lazy
"Sentence boundary start for sentence %s was %d characters ahead of "
+ "the previous sentence, this might indicate the sentence extractor "
+ "failed to properly split text.",
sentence,
plaintext_start - length_so_far_in_plain_text,
)

start = plaintext_to_tex_offset_map[plaintext_start]
end = plaintext_to_tex_offset_map[plaintext_end]
length_so_far_in_plain_text = plaintext_end
sentence_tex = tex[start:end]

# Extract TeX around sentence to understand the environment in which it appears
Expand Down
2 changes: 1 addition & 1 deletion data-processing/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ numpy
opencv-python==3.4.8.29
peewee
psycopg2-binary
pysbd
pysbd==0.3.0
reportlab
requests
TexSoup==0.2.0
Expand Down
12 changes: 11 additions & 1 deletion data-processing/tests/test_colorize_sentences.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,18 @@ def create_sentence(tex: str, start: int) -> Sentence:
tex_path="main.tex",
id_=0,
tex=tex,
context_tex="<extracted text>",
context_tex="<extracted tex>",
text="<plaintext>",
sanitized_text="<sanitized>",
section_name="<current section>",
in_figure=False,
in_table=False,
in_itemize=False,
label=[],
ref=[],
cite=[],
url=[],
others=[],
)


Expand Down
18 changes: 7 additions & 11 deletions data-processing/tests/test_colorize_tex.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,12 @@
import re

from common.colorize_tex import (
COLOR_MACROS,
COLOR_MACROS_BASE_MACROS,
COLOR_MACROS_LATEX_IMPORTS,
COLOR_MACROS_TEX_IMPORTS,
add_color_macros,
colorize_citations,
colorize_entities,
colorize_equation_tokens,
)
from common.types import Equation, FileContents, SerializableEntity, SerializableToken
from common.colorize_tex import (COLOR_MACROS, COLOR_MACROS_BASE_MACROS,
COLOR_MACROS_LATEX_IMPORTS,
COLOR_MACROS_TEX_IMPORTS, add_color_macros,
colorize_citations, colorize_entities,
colorize_equation_tokens)
from common.types import (Equation, FileContents, SerializableEntity,
SerializableToken)

COLOR_PATTERN = (
r"\\scholarsetcolor\[rgb\]{[0-9.]+,[0-9.]+,[0-9.]+}"
Expand Down
20 changes: 10 additions & 10 deletions data-processing/tests/test_parse_tex.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,12 @@ def test_extract_sentences():

sentence1 = sentences[0]
assert sentence1.start == 0
assert sentence1.end == 40
assert sentence1.end == 41
assert sentences[0].text == "This is the first argsentence."

sentence2 = sentences[1]
assert sentence2.start == 41
assert sentence2.end == 69
assert sentence2.end == 70
assert sentences[1].text == "This is the second sentence."


Expand All @@ -91,7 +91,7 @@ def test_sentence_splitting_end_points():
)

assert len(sentences) == 4
sentence_end_points = [[0, 19], [20, 47], [48, 66], [67, 86]]
sentence_end_points = [[0, 20], [20, 48], [48, 67], [67, 86]]
for i, [start, end] in enumerate(sentence_end_points):
assert sentences[i].start == start
assert sentences[i].end == end
Expand All @@ -107,13 +107,13 @@ def test_sentence_splitting_end_points_and_more_text():
)
assert len(sentences) == 8
sentence_end_points = [
[0, 14],
[15, 25],
[26, 31],
[32, 40],
[41, 50],
[51, 60],
[61, 76],
[0, 15],
[15, 26],
[26, 32],
[32, 41],
[41, 51],
[51, 61],
[61, 77],
[77, 83],
]
for i, [start, end] in enumerate(sentence_end_points):
Expand Down

0 comments on commit 4fae978

Please sign in to comment.