Skip to content

Commit

Permalink
Fix not-contraction offsets (#15)
Browse files Browse the repository at this point in the history
* fix not-contraction offsets + add test
* do not differ in offset calculation when using replace_not_contraction=True or False
  • Loading branch information
KDercksen authored Dec 17, 2021
1 parent 10ee9b1 commit 9d217f6
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 1 deletion.
2 changes: 1 addition & 1 deletion syntok/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ def _produce_separator_split_token(
yield Token(prefix, word[remainder:mo.start() - 1], offset + remainder)
prefix = ""

yield Token(prefix, "not" if self.replace_not_contraction else 'n' + mo.group(0), offset + mo.start())
yield Token(prefix, "not" if self.replace_not_contraction else 'n' + mo.group(0), offset + mo.start() - 1)
return ""

yield Token(prefix, word[remainder:mo.start()], offset + remainder)
Expand Down
17 changes: 17 additions & 0 deletions syntok/tokenizer_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,23 @@ def test_nonword_high_prefix(self):
self.assertListEqual(s(result), ["\U0001F64C", ".", "A"])
self.assertListEqual([t.offset for t in result], [0, 1, 2]) # requires Py3.3+

def test_apostrophe_offset_without_replace_not_contraction(self):
    # NOTE: with replace_not_contraction=False no text is substituted, so every
    # token offset is expected to point at its position in the original input.
    self.tokenizer = Tokenizer(replace_not_contraction=False)
    tokens = self.tokenizer.split("don't")
    # The contraction token is expected at index 2, the "n" of "don't".
    offsets = [token.offset for token in tokens]
    self.assertListEqual(offsets, [0, 2])

def test_apostrophe_offset_with_replace_not_contraction(self):
    # NOTE: with replace_not_contraction=True the "n't" part is emitted as "not"
    # (e.g. "don't" -> "do not", "can't" -> "can not"), which introduces a space
    # that is not present in the input. The expected offsets are nevertheless
    # the same as without replacement: "not" is expected at index 2, the "n"
    # of "don't".
    self.tokenizer = Tokenizer(replace_not_contraction=True)
    tokens = self.tokenizer.split("don't")
    self.assertListEqual([token.offset for token in tokens], [0, 2])
    self.assertListEqual([token.value for token in tokens], ["do", "not"])


class TestToText(TestCase):

Expand Down

0 comments on commit 9d217f6

Please sign in to comment.