From e57eb4003152a3bfbbf8786d1c7a5ff1739ac1fb Mon Sep 17 00:00:00 2001 From: Sorami Hisamoto Date: Tue, 2 Jun 2020 09:51:36 +0900 Subject: [PATCH 1/4] Change a variable name to modified_to_original, to make it align with the original Java implementation --- sudachipy/utf8inputtextbuilder.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sudachipy/utf8inputtextbuilder.py b/sudachipy/utf8inputtextbuilder.py index 4def75f..d79ab25 100644 --- a/sudachipy/utf8inputtextbuilder.py +++ b/sudachipy/utf8inputtextbuilder.py @@ -24,7 +24,7 @@ def __init__(self, text, grammar): self.grammar = grammar self.original_text = text self.modified_text = text - self.text_offsets = list(range(len(self.original_text) + 1)) + self.modified_to_original = list(range(len(self.original_text) + 1)) # 注: サロゲートペア文字は考慮していない def replace(self, begin, end, str_): @@ -42,15 +42,15 @@ def replace(self, begin, end, str_): self.modified_text = str_.join([self.modified_text[:begin], self.modified_text[end:]]) - offset = self.text_offsets[begin] + offset = self.modified_to_original[begin] length = len(str_) if end - begin > length: - del self.text_offsets[begin + length:end] + del self.modified_to_original[begin + length:end] for i in range(length): if begin + i < end: - self.text_offsets[begin + i] = offset + self.modified_to_original[begin + i] = offset else: - self.text_offsets.insert(begin + i, offset) + self.modified_to_original.insert(begin + i, offset) def get_original_text(self): return self.original_text @@ -70,10 +70,10 @@ def build(self): # 注: サロゲートペア文字は考慮していない for _ in range(self.utf8_byte_length(ord(self.modified_text[i]))): byte_indexes[j] = i - offsets[j] = self.text_offsets[i] + offsets[j] = self.modified_to_original[i] j += 1 byte_indexes[length] = len(modified_string_text) - offsets[length] = self.text_offsets[-1] + offsets[length] = self.modified_to_original[-1] char_categories = self.get_char_category_types(modified_string_text) 
char_category_continuities = self.get_char_category_continuities(modified_string_text, length, char_categories) From 202dc1a024f1e550e24d665ddb4b0a7143db590f Mon Sep 17 00:00:00 2001 From: Sorami Hisamoto Date: Tue, 2 Jun 2020 10:08:18 +0900 Subject: [PATCH 2/4] =?UTF-8?q?Fix=20a=20bug=20causing=20=E2=80=A6=20is=20?= =?UTF-8?q?converted=20to=20"",=20"",=20"=E2=80=A6"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sudachipy/utf8inputtextbuilder.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sudachipy/utf8inputtextbuilder.py b/sudachipy/utf8inputtextbuilder.py index d79ab25..694375b 100644 --- a/sudachipy/utf8inputtextbuilder.py +++ b/sudachipy/utf8inputtextbuilder.py @@ -42,15 +42,17 @@ def replace(self, begin, end, str_): self.modified_text = str_.join([self.modified_text[:begin], self.modified_text[end:]]) - offset = self.modified_to_original[begin] + modified_begin = self.modified_to_original[begin] + modified_end = self.modified_to_original[end] length = len(str_) if end - begin > length: del self.modified_to_original[begin + length:end] - for i in range(length): + self.modified_to_original[begin] = modified_begin + for i in range(1, length): if begin + i < end: - self.modified_to_original[begin + i] = offset + self.modified_to_original[begin + i] = modified_end else: - self.modified_to_original.insert(begin + i, offset) + self.modified_to_original.insert(begin + i, modified_end) def get_original_text(self): return self.original_text From faeba33f099870c29fbf276ba0fa3a804ed837b0 Mon Sep 17 00:00:00 2001 From: Sorami Hisamoto Date: Tue, 2 Jun 2020 10:27:09 +0900 Subject: [PATCH 3/4] Fix tests according to the new replace method --- tests/plugin/test_default_input_text_plugin.py | 5 +++-- tests/test_tokenizer.py | 10 ++++++++++ tests/test_utf8inputtext.py | 18 +++++++++--------- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git 
a/tests/plugin/test_default_input_text_plugin.py b/tests/plugin/test_default_input_text_plugin.py index ccec51e..471706b 100644 --- a/tests/plugin/test_default_input_text_plugin.py +++ b/tests/plugin/test_default_input_text_plugin.py @@ -60,7 +60,7 @@ def test_before_rewrite(self): self.assertEqual(9, text.get_original_index(24)) self.assertEqual(9, text.get_original_index(26)) - def test_after_write(self): + def test_after_rewrite(self): self.assertEqual(self.original_text, self.builder.get_original_text()) self.assertEqual(self.original_text, self.builder.get_text()) self.plugin.rewrite(self.builder) @@ -76,7 +76,8 @@ def test_after_write(self): self.assertEqual(1, text.get_original_index(2)) self.assertEqual(2, text.get_original_index(3)) self.assertEqual(4, text.get_original_index(7)) - self.assertEqual(4, text.get_original_index(11)) + self.assertEqual(5, text.get_original_index(8)) + self.assertEqual(5, text.get_original_index(11)) self.assertEqual(7, text.get_original_index(15)) self.assertEqual(7, text.get_original_index(17)) diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 65b4254..2ee548d 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -70,6 +70,16 @@ def test_tokenize_kanji_alphabet_word(self): self.assertEqual(len(self.tokenizer_obj.tokenize('ab')), 1) self.assertEqual(len(self.tokenizer_obj.tokenize('特ab')), 2) + def test_tokenizer_with_dots(self): + ms = self.tokenizer_obj.tokenize('京都…') + self.assertEqual(4, ms.size()) + self.assertEqual(ms[1].surface(), '…') + self.assertEqual(ms[1].normalized_form(), '.') + self.assertEqual(ms[2].surface(), '') + self.assertEqual(ms[2].normalized_form(), '.') + self.assertEqual(ms[3].surface(), '') + self.assertEqual(ms[3].normalized_form(), '.') + if __name__ == '__main__': unittest.main() diff --git a/tests/test_utf8inputtext.py b/tests/test_utf8inputtext.py index b92b851..affb551 100644 --- a/tests/test_utf8inputtext.py +++ b/tests/test_utf8inputtext.py @@ -117,8 +117,8 
@@ def test_replace_with_same_length(self): self.assertEqual(input_.get_original_index(12), 7) self.assertEqual(input_.get_original_index(13), 8) self.assertEqual(input_.get_original_index(15), 8) - self.assertEqual(input_.get_original_index(16), 8) - self.assertEqual(input_.get_original_index(18), 8) + self.assertEqual(input_.get_original_index(16), 10) + self.assertEqual(input_.get_original_index(18), 10) self.assertEqual(input_.get_original_index(19), 10) self.assertEqual(input_.get_original_index(22), 10) self.assertEqual(input_.get_original_index(31), 13) @@ -147,13 +147,13 @@ def test_replaceWithInsertion(self): self.assertEqual(input_.get_original_text(), self.TEXT) self.assertEqual(input_.get_text(), "âbC1あ234あああ𡈽アゴ") self.assertEqual(len(input_.get_byte_text()), 35) - self.assertEqual(input_.get_original_index(0), 0) - self.assertEqual(input_.get_original_index(12), 7) - self.assertEqual(input_.get_original_index(13), 8) - self.assertEqual(input_.get_original_index(21), 8) - self.assertEqual(input_.get_original_index(22), 10) - self.assertEqual(input_.get_original_index(25), 10) - self.assertEqual(input_.get_original_index(35), 14) + self.assertEqual(input_.get_original_index(0), 0) # â + self.assertEqual(input_.get_original_index(12), 7) # 4 + self.assertEqual(input_.get_original_index(13), 8) # >あ< ああ + self.assertEqual(input_.get_original_index(21), 10) # ああ >あ< + self.assertEqual(input_.get_original_index(22), 10) # 𡈽 + self.assertEqual(input_.get_original_index(25), 10) # 𡈽 + self.assertEqual(input_.get_original_index(35), 14) # ゙ def test_replaceMultiTimes(self): self.builder.replace(0, 1, "a") From 360d9e922f81d36ff34c0722913ff37112555238 Mon Sep 17 00:00:00 2001 From: Sorami Hisamoto Date: Tue, 2 Jun 2020 10:30:27 +0900 Subject: [PATCH 4/4] Fix comment format --- tests/test_utf8inputtext.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/test_utf8inputtext.py b/tests/test_utf8inputtext.py index 
affb551..c7b0527 100644 --- a/tests/test_utf8inputtext.py +++ b/tests/test_utf8inputtext.py @@ -147,13 +147,13 @@ def test_replaceWithInsertion(self): self.assertEqual(input_.get_original_text(), self.TEXT) self.assertEqual(input_.get_text(), "âbC1あ234あああ𡈽アゴ") self.assertEqual(len(input_.get_byte_text()), 35) - self.assertEqual(input_.get_original_index(0), 0) # â - self.assertEqual(input_.get_original_index(12), 7) # 4 - self.assertEqual(input_.get_original_index(13), 8) # >あ< ああ - self.assertEqual(input_.get_original_index(21), 10) # ああ >あ< - self.assertEqual(input_.get_original_index(22), 10) # 𡈽 - self.assertEqual(input_.get_original_index(25), 10) # 𡈽 - self.assertEqual(input_.get_original_index(35), 14) # ゙ + self.assertEqual(input_.get_original_index(0), 0) # â + self.assertEqual(input_.get_original_index(12), 7) # 4 + self.assertEqual(input_.get_original_index(13), 8) # >あ< ああ + self.assertEqual(input_.get_original_index(21), 10) # ああ >あ< + self.assertEqual(input_.get_original_index(22), 10) # 𡈽 + self.assertEqual(input_.get_original_index(25), 10) # 𡈽 + self.assertEqual(input_.get_original_index(35), 14) # ゙ def test_replaceMultiTimes(self): self.builder.replace(0, 1, "a")