Skip to content

Commit

Permalink
Fix leading spaces in Japanese (ja) tokenization (#5969)
Browse files Browse the repository at this point in the history
* change the condition for detecting a space after a word

* add NAUGHTY_STRINGS test example
  • Loading branch information
hiroshi-matsuda-rit authored Aug 25, 2020
1 parent 450720a commit 332803e
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 2 deletions.
4 changes: 2 additions & 2 deletions spacy/lang/ja/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
return text_dtokens, text_spaces

# align words and dtokens by referring to the text, and insert gap tokens for the space char spans
for word, dtoken in zip(words, dtokens):
for i, (word, dtoken) in enumerate(zip(words, dtokens)):
# skip all space tokens
if word.isspace():
continue
Expand All @@ -119,7 +119,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
text_spaces.append(False)
text_pos += len(word)
# poll a space char after the word
if text_pos < len(text) and text[text_pos] == " ":
if i + 1 < len(dtokens) and dtokens[i + 1].surface == " ":
text_spaces[-1] = True
text_pos += 1

Expand Down
1 change: 1 addition & 0 deletions spacy/tests/tokenizer/test_naughty_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
r"₀₁₂",
r"⁰⁴⁵₀₁₂",
r"ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็",
r" ̄ ̄",
# Two-Byte Characters
r"田中さんにあげて下さい",
r"パーティーへ行かないか",
Expand Down

0 comments on commit 332803e

Please sign in to comment.