Skip to content

Commit

Permalink
Pipeline: bump to pysbd 0.3.0 to incorporate fix for character spans
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewhead committed Aug 11, 2020
1 parent e76ae07 commit 4fae978
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 43 deletions.
26 changes: 6 additions & 20 deletions data-processing/entities/sentences/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,10 +156,7 @@ def parse(self, tex_path: str, tex: str) -> Iterator[Sentence]:
plaintext_to_tex_offset_map[len(plaintext)] = last_segment.tex_end

# Segment the plaintext. Return offsets for each sentence relative to the TeX input
segmenter = pysbd.Segmenter(language="en", clean=False)

# Record the current length of the plain text to account for the extractor bug.
length_so_far_in_plain_text = 0
segmenter = pysbd.Segmenter(language="en", clean=False, char_span=True)

# As each sentence is scanned, keep track of what sections and environments the
# sentence appears within.
Expand All @@ -168,27 +165,25 @@ def parse(self, tex_path: str, tex: str) -> Iterator[Sentence]:
in_table = False
in_itemize = False

for i, sentence in enumerate(segmenter.segment(plaintext)):
for i, span in enumerate(segmenter.segment(plaintext)):
# The pysbd module has several open bugs and issues which are addressed below.
# As of 3/23/20 we know the module will fail in the following ways:
# 1. pysbd will not break up the sentence when it starts with a punctuation mark or space.
# ex: ". hello. world. hi."
# sol: check for sentences being longer than 1000 characters.
# 2. pysbd indexes are sometimes incorrectly set
# ex: "hello. world. 1) item one. 2) item two. 3) item three" or "hello!!! world."
# sol: set indexes manually using string search + sentence length
# 3. pysbd uses reserved characters for splitting sentences
# 2. pysbd uses reserved characters for splitting sentences
# ex: see PYSBD_RESERVED_CHARACTERS list.
# sol: throw a warning if the sentence contains any of these characters.
sentence = span.sent
if len(sentence) > 1000:
logging.warning( # pylint: disable=logging-not-lazy
"Exceptionally long sentence (length %d). This might indicate the sentence "
+ "extractor failed to properly split text into sentences.",
len(sentence),
)

plaintext_start = plaintext.find(sentence, length_so_far_in_plain_text)
plaintext_end = plaintext_start + len(sentence)
plaintext_start = span.start
plaintext_end = span.end
if (
plaintext_start not in plaintext_to_tex_offset_map
or plaintext_end not in plaintext_to_tex_offset_map
Expand All @@ -199,18 +194,9 @@ def parse(self, tex_path: str, tex: str) -> Iterator[Sentence]:
sentence,
)
continue
if plaintext_start - 500 > length_so_far_in_plain_text:
logging.warning( # pylint: disable=logging-not-lazy
"Sentence boundary start for sentence %s was %d characters ahead of "
+ "the previous sentence, this might indicate the sentence extractor "
+ "failed to properly split text.",
sentence,
plaintext_start - length_so_far_in_plain_text,
)

start = plaintext_to_tex_offset_map[plaintext_start]
end = plaintext_to_tex_offset_map[plaintext_end]
length_so_far_in_plain_text = plaintext_end
sentence_tex = tex[start:end]

# Extract TeX around sentence to understand the environment in which it appears
Expand Down
2 changes: 1 addition & 1 deletion data-processing/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ numpy
opencv-python==3.4.8.29
peewee
psycopg2-binary
pysbd
pysbd==0.3.0
reportlab
requests
TexSoup==0.2.0
Expand Down
12 changes: 11 additions & 1 deletion data-processing/tests/test_colorize_sentences.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,18 @@ def create_sentence(tex: str, start: int) -> Sentence:
tex_path="main.tex",
id_=0,
tex=tex,
context_tex="<extracted text>",
context_tex="<extracted tex>",
text="<plaintext>",
sanitized_text="<sanitized>",
section_name="<current section>",
in_figure=False,
in_table=False,
in_itemize=False,
label=[],
ref=[],
cite=[],
url=[],
others=[],
)


Expand Down
18 changes: 7 additions & 11 deletions data-processing/tests/test_colorize_tex.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,12 @@
import re

from common.colorize_tex import (
COLOR_MACROS,
COLOR_MACROS_BASE_MACROS,
COLOR_MACROS_LATEX_IMPORTS,
COLOR_MACROS_TEX_IMPORTS,
add_color_macros,
colorize_citations,
colorize_entities,
colorize_equation_tokens,
)
from common.types import Equation, FileContents, SerializableEntity, SerializableToken
from common.colorize_tex import (COLOR_MACROS, COLOR_MACROS_BASE_MACROS,
COLOR_MACROS_LATEX_IMPORTS,
COLOR_MACROS_TEX_IMPORTS, add_color_macros,
colorize_citations, colorize_entities,
colorize_equation_tokens)
from common.types import (Equation, FileContents, SerializableEntity,
SerializableToken)

COLOR_PATTERN = (
r"\\scholarsetcolor\[rgb\]{[0-9.]+,[0-9.]+,[0-9.]+}"
Expand Down
20 changes: 10 additions & 10 deletions data-processing/tests/test_parse_tex.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,12 @@ def test_extract_sentences():

sentence1 = sentences[0]
assert sentence1.start == 0
assert sentence1.end == 40
assert sentence1.end == 41
assert sentences[0].text == "This is the first argsentence."

sentence2 = sentences[1]
assert sentence2.start == 41
assert sentence2.end == 69
assert sentence2.end == 70
assert sentences[1].text == "This is the second sentence."


Expand All @@ -91,7 +91,7 @@ def test_sentence_splitting_end_points():
)

assert len(sentences) == 4
sentence_end_points = [[0, 19], [20, 47], [48, 66], [67, 86]]
sentence_end_points = [[0, 20], [20, 48], [48, 67], [67, 86]]
for i, [start, end] in enumerate(sentence_end_points):
assert sentences[i].start == start
assert sentences[i].end == end
Expand All @@ -107,13 +107,13 @@ def test_sentence_splitting_end_points_and_more_text():
)
assert len(sentences) == 8
sentence_end_points = [
[0, 14],
[15, 25],
[26, 31],
[32, 40],
[41, 50],
[51, 60],
[61, 76],
[0, 15],
[15, 26],
[26, 32],
[32, 41],
[41, 51],
[51, 61],
[61, 77],
[77, 83],
]
for i, [start, end] in enumerate(sentence_end_points):
Expand Down

0 comments on commit 4fae978

Please sign in to comment.