Skip to content

Commit

Permalink
🎨 ♻️ Update sentence char_span logic
Browse files Browse the repository at this point in the history
account for same text with different char_span
  • Loading branch information
nipunsadvilkar committed Jul 26, 2020
1 parent 29d9d12 commit 0e364f4
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 12 deletions.
1 change: 0 additions & 1 deletion pysbd/lang/common/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
class Common(object):

# added special case: r"[。..!!?].*" to handle intermittent dots, exclamation, etc.
# TODO: above special cases group can be updated as per developer needs
SENTENCE_BOUNDARY_REGEX = r"((?:[^)])*)(?=\s?[A-Z])|「(?:[^」])*」(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|\'(?:[^\'])*[^,]\'(?=\s[A-Z])|\"(?:[^\"])*[^,]\"(?=\s[A-Z])|\“(?:[^\”])*[^,]\”(?=\s[A-Z])|[。..!!??].*|\S.*?[。..!!??ȸȹ☉☈☇☄]"

# # Rubular: http://rubular.com/r/NqCqv372Ix
Expand Down
28 changes: 19 additions & 9 deletions pysbd/segmenter.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,23 @@ def processor(self, text):

def sentences_with_char_spans(self, sentences):
# since SENTENCE_BOUNDARY_REGEX doesn't account
# for trailing whitespaces \s* is used as suffix
# for trailing whitespaces, \s* is used as a suffix
# to keep non-destructive text after segments joins
sent_spans = set((match.group(), match.start(), match.end()) for sent in sentences
for match in re.finditer('{0}\s*'.format(re.escape(sent)),
self.original_text))
sorted_spans = sorted(sent_spans, key=lambda x: x[1])
return [TextSpan(sent, start, end) for sent, start, end in sorted_spans]
sent_spans = []
prior_start_char_idx = 0
for sent in sentences:
for match in re.finditer(r'{0}\s*'.format(re.escape(sent)), self.original_text):
match_str = match.group()
match_start_idx, match_end_idx = match.span()
if match_start_idx >= prior_start_char_idx:
# make sure the current sentence's match is either
# the first sentence (start index 0 or later) or starts
# at/after the start of the previously recorded sentence span
sent_spans.append(
TextSpan(match_str, match_start_idx, match_end_idx))
prior_start_char_idx = match_start_idx
break
return sent_spans

def segment(self, text):
self.original_text = text
Expand All @@ -68,11 +78,11 @@ def segment(self, text):
text = self.cleaner(text).clean()
postprocessed_sents = self.processor(text).process()
sentence_w_char_spans = self.sentences_with_char_spans(postprocessed_sents)
if self.clean:
if self.char_span:
return sentence_w_char_spans
elif self.clean:
# clean and destructed sentences
return postprocessed_sents
elif self.char_span:
return sentence_w_char_spans
else:
# nondestructive with whitespaces
return [textspan.sent for textspan in sentence_w_char_spans]
4 changes: 2 additions & 2 deletions pysbd/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ def __init__(self, sent, start, end):
self.end = end

def __repr__(self): # pragma: no cover
return "{0}(sent='{1}', start={2}, end={3})".format(
self.__class__.__name__, self.sent, self.start, self.end)
return "{0}(sent={1}, start={2}, end={3})".format(
self.__class__.__name__, repr(self.sent), self.start, self.end)

def __eq__(self, other):
if isinstance(self, other.__class__):
Expand Down
26 changes: 26 additions & 0 deletions tests/test_segmenter.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,32 @@ def test_sbd_char_span(en_no_clean_with_span_fixture, text, expected):
# clubbing sentences and matching with original text
assert text == "".join([seg.sent for seg in segments])

def test_same_sentence_different_char_span(en_no_clean_with_span_fixture):
"""Check that repeated sentences get distinct char spans and that segmentation is non-destructive."""
# The sample text repeats "***" and "(Laughter.)" at different offsets,
# so each occurrence must be reported with its own start/end indices.
text = """From the AP comes this story :
President Bush on Tuesday nominated two individuals to replace retiring jurists on federal courts in the Washington area.
***
After you are elected in 2004, what will your memoirs say about you, what will the title be, and what will the main theme say?
***
"THE PRESIDENT: I appreciate that.
(Laughter.)
My life is too complicated right now trying to do my job.
(Laughter.)"""
# Expected spans are contiguous: each span's start equals the previous span's end,
# and every sentence except the last keeps its trailing newline.
expected_text_spans = [TextSpan(sent='From the AP comes this story :\n', start=0, end=31),
TextSpan(sent='President Bush on Tuesday nominated two individuals to replace retiring jurists on federal courts in the Washington area.\n', start=31, end=153),
TextSpan(sent='***\n', start=153, end=157),
TextSpan(sent='After you are elected in 2004, what will your memoirs say about you, what will the title be, and what will the main theme say?\n', start=157, end=284),
TextSpan(sent='***\n', start=284, end=288),
TextSpan(sent='"THE PRESIDENT: I appreciate that.\n', start=288, end=323),
TextSpan(sent='(Laughter.)\n', start=323, end=335),
TextSpan(sent='My life is too complicated right now trying to do my job.\n', start=335, end=393),
TextSpan(sent='(Laughter.)', start=393, end=404)]
segments_w_spans = en_no_clean_with_span_fixture.segment(text)
assert segments_w_spans == expected_text_spans
# Non-destruction check: concatenating the segment texts
# must reproduce the original input exactly.
assert text == "".join([seg.sent for seg in segments_w_spans])

def test_exception_with_both_clean_and_span_true():
"""Test to not allow clean=True and char_span=True
"""
Expand Down

0 comments on commit 0e364f4

Please sign in to comment.