🎨 ♻️ Update sentence char_span logic

Account for identical sentence text that occurs at different char spans.
nipunsadvilkar · Jul 26, 2020 · 0e364f4 · 0e364f4
1 parent 29d9d12
commit 0e364f4
Show file tree

Hide file tree

Showing 4 changed files with 47 additions and 12 deletions.
diff --git a/pysbd/lang/common/common.py b/pysbd/lang/common/common.py
@@ -5,7 +5,6 @@
 class Common(object):
 
     # added special case: r"[。．.！!?].*" to handle intermittent dots, exclamation, etc.
-    # TODO: above special cases group can be updated as per developer needs
     SENTENCE_BOUNDARY_REGEX = r"（(?:[^）])*）(?=\s?[A-Z])|「(?:[^」])*」(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|\'(?:[^\'])*[^,]\'(?=\s[A-Z])|\"(?:[^\"])*[^,]\"(?=\s[A-Z])|\“(?:[^\”])*[^,]\”(?=\s[A-Z])|[。．.！!?？].*|\S.*?[。．.！!?？ȸȹ☉☈☇☄]"
 
     # # Rubular: http://rubular.com/r/NqCqv372Ix

diff --git a/pysbd/segmenter.py b/pysbd/segmenter.py
@@ -49,13 +49,23 @@ def processor(self, text):
 
     def sentences_with_char_spans(self, sentences):
         # since SENTENCE_BOUNDARY_REGEX doesnt account
-        # for trailing whitespaces \s* is used as suffix
+        # for trailing whitespaces, \s* is used as a suffix
         # to keep non-destructive text after segments joins
-        sent_spans = set((match.group(), match.start(), match.end()) for sent in sentences
-                for match in re.finditer('{0}\s*'.format(re.escape(sent)),
-                self.original_text))
-        sorted_spans = sorted(sent_spans, key=lambda x: x[1])
-        return [TextSpan(sent, start, end) for sent, start, end in sorted_spans]
+        sent_spans = []
+        prior_start_char_idx = 0
+        for sent in sentences:
+            for match in re.finditer(r'{0}\s*'.format(re.escape(sent)), self.original_text):
+                match_str = match.group()
+                match_start_idx, match_end_idx = match.span()
+                if match_start_idx >= prior_start_char_idx:
+                    # accept only the first match that does not start before
+                    # the previously accepted span, so repeated sentence text
+                    # is never mapped back to an earlier occurrence
+                    sent_spans.append(
+                        TextSpan(match_str, match_start_idx, match_end_idx))
+                    prior_start_char_idx = match_start_idx
+                    break
+        return sent_spans
 
     def segment(self, text):
         self.original_text = text
@@ -68,11 +78,11 @@ def segment(self, text):
             text = self.cleaner(text).clean()
         postprocessed_sents = self.processor(text).process()
         sentence_w_char_spans = self.sentences_with_char_spans(postprocessed_sents)
-        if self.clean:
+        if self.char_span:
+            return sentence_w_char_spans
+        elif self.clean:
             # clean and destructed sentences
             return postprocessed_sents
-        elif self.char_span:
-            return sentence_w_char_spans
         else:
             # nondestructive with whitespaces
             return [textspan.sent for textspan in sentence_w_char_spans]
diff --git a/pysbd/utils.py b/pysbd/utils.py
@@ -56,8 +56,8 @@ def __init__(self, sent, start, end):
         self.end = end
 
     def __repr__(self):  # pragma: no cover
-        return "{0}(sent='{1}', start={2}, end={3})".format(
-            self.__class__.__name__, self.sent, self.start, self.end)
+        return "{0}(sent={1}, start={2}, end={3})".format(
+            self.__class__.__name__, repr(self.sent), self.start, self.end)
 
     def __eq__(self, other):
         if isinstance(self, other.__class__):

diff --git a/tests/test_segmenter.py b/tests/test_segmenter.py
@@ -37,6 +37,32 @@ def test_sbd_char_span(en_no_clean_with_span_fixture, text, expected):
     # clubbing sentences and matching with original text
     assert text == "".join([seg.sent for seg in segments])
 
+def test_same_sentence_different_char_span(en_no_clean_with_span_fixture):
+    """Test same sentences with different char offsets & check for non-destruction"""
+    text = """From the AP comes this story :
+President Bush on Tuesday nominated two individuals to replace retiring jurists on federal courts in the Washington area.
+***
+After you are elected in 2004, what will your memoirs say about you, what will the title be, and what will the main theme say?
+***
+"THE PRESIDENT: I appreciate that.
+(Laughter.)
+My life is too complicated right now trying to do my job.
+(Laughter.)"""
+    expected_text_spans = [TextSpan(sent='From the AP comes this story :\n', start=0, end=31),
+    TextSpan(sent='President Bush on Tuesday nominated two individuals to replace retiring jurists on federal courts in the Washington area.\n', start=31, end=153),
+    TextSpan(sent='***\n', start=153, end=157),
+    TextSpan(sent='After you are elected in 2004, what will your memoirs say about you, what will the title be, and what will the main theme say?\n', start=157, end=284),
+    TextSpan(sent='***\n', start=284, end=288),
+    TextSpan(sent='"THE PRESIDENT: I appreciate that.\n', start=288, end=323),
+    TextSpan(sent='(Laughter.)\n', start=323, end=335),
+    TextSpan(sent='My life is too complicated right now trying to do my job.\n', start=335, end=393),
+    TextSpan(sent='(Laughter.)', start=393, end=404)]
+    segments_w_spans = en_no_clean_with_span_fixture.segment(text)
+    assert segments_w_spans == expected_text_spans
+    # check for non-destruction
+    # clubbing sentences and matching with original text
+    assert text == "".join([seg.sent for seg in segments_w_spans])
+
 def test_exception_with_both_clean_and_span_true():
     """Test to not allow clean=True and char_span=True
     """