From dadf153bd41c106f20c7d2543058f71f75bf2168 Mon Sep 17 00:00:00 2001 From: nikkie Date: Sun, 20 Mar 2022 11:50:59 +0900 Subject: [PATCH 01/10] Fix typo (fix #238) --- docs/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index 90bed6c..1c43b12 100644 --- a/docs/index.md +++ b/docs/index.md @@ -80,7 +80,7 @@ $ pip uninstall ginza ja_ginza_electra $ pip uninstall ja_ginza ``` -旧バージョンの`j_ginza_electra`をインストールしている場合は次のコマンドでアンインストールします。 +旧バージョンの`ja_ginza_electra`をインストールしている場合は次のコマンドでアンインストールします。 ```console $ pip uninstall ja_ginza_electra ``` From fe43469fff26c4c9d55c6ce6c151936b039dea0d Mon Sep 17 00:00:00 2001 From: r-terada Date: Wed, 23 Mar 2022 21:48:32 +0900 Subject: [PATCH 02/10] add pytest github actions workflow --- .github/workflows/pytest.yml | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 .github/workflows/pytest.yml diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml new file mode 100644 index 0000000..c1cd64c --- /dev/null +++ b/.github/workflows/pytest.yml @@ -0,0 +1,34 @@ +name: pytest + +on: + push: + branches: + - master + pull_request: + branches: + - develop + +jobs: + pytest: + name: Run tests with pytest + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.7, 3.8] + steps: + - name: Checkout + uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + - name: Upgrade pip + run: >- + python -m + pip install -U pip + - name: Install dependencies + run: >- + python -m + pip install . pytest pytest-mock ja-ginza ja-ginza-electra + - name: Run Tests + run: pytest From 86c7b9b25f0eba7fbd868b8208b3daeaa5c62b2e Mon Sep 17 00:00:00 2001 From: r-terada Date: Thu, 24 Mar 2022 01:07:16 +0900 Subject: [PATCH 03/10] stop using deepcopy --- ginza/tests/test_models.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ginza/tests/test_models.py b/ginza/tests/test_models.py index 27b0951..a54e2f9 100644 --- a/ginza/tests/test_models.py +++ b/ginza/tests/test_models.py @@ -176,7 +176,6 @@ def test_tokenize(nlp, text, expected_tokens): def test_compound_spliter(nlp, text, len_a, len_b, len_c): assert len(nlp(text)) == len_c for split_mode, l in zip(["A", "B", "C"], [len_a, len_b, len_c]): - nlp = deepcopy(nlp) set_split_mode(nlp, split_mode) assert len(nlp(text)) == l From fa4be0aa2bec7085d862c9d61f207dd369eb1cc4 Mon Sep 17 00:00:00 2001 From: wafuwafu13 Date: Tue, 3 May 2022 20:34:42 +0900 Subject: [PATCH 04/10] docs(README.md): Add prompt --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 68aff5b..af147eb 100644 --- a/README.md +++ b/README.md @@ -534,10 +534,10 @@ Ginza uses the pytest framework for testing, and you can run the tests via `setu Some tests depends on the ginza default models (`ja-ginza`, `ja-ginza-electra`), so install them before the tests is needed. ```console -pip install ja-ginza ja-ginza-electra -pip install -e . +$ pip install ja-ginza ja-ginza-electra +$ pip install -e . # full test -python setup.py test +$ python setup.py test # test single file -python setup.py test --addopts ginza/tests/test_analyzer.py +$ python setup.py test --addopts ginza/tests/test_analyzer.py ``` From fdcc975f800d11379f65d4858affaea75af51396 Mon Sep 17 00:00:00 2001 From: wafuwafu13 Date: Tue, 3 May 2022 20:39:11 +0900 Subject: [PATCH 05/10] test(analyzer): Add split_mode --- ginza/tests/test_analyzer.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/ginza/tests/test_analyzer.py b/ginza/tests/test_analyzer.py index df857bd..17b9204 100644 --- a/ginza/tests/test_analyzer.py +++ b/ginza/tests/test_analyzer.py @@ -144,3 +144,26 @@ def test_analyze_batch(self, output_format, input_batch, tokens_batch, tokens_fu analyzer.output_format = output_format ret = analyzer.analyze_batch(input_batch) assert tokens_func(ret) == sum(tokens_batch, []) + + @pytest.mark.parametrize( + "raises_analysis_before_set, tokens_func", + [ + (TypeError, _tokens_conllu) + ], + ) + @pytest.mark.parametrize( + "split_mode, input_text, tokens", + [ + ("A", "機能性食品", ["機能", "性", "食品"]), + ("B", "機能性食品", ["機能性", "食品"]), + ("C", "機能性食品", ["機能性食品"]), + ], + ) + def test_analyze_split(self, split_mode, input_text, tokens, raises_analysis_before_set, tokens_func, analyzer): + analyzer.split_mode = split_mode + with pytest.raises(raises_analysis_before_set): + analyzer.analyze_line(input_text) + + analyzer.set_nlp() + ret = analyzer.analyze_line(input_text) + assert tokens_func(ret) == tokens From ba7a6b9af7ca3352f3089450a7137088371bfc15 Mon Sep 17 00:00:00 2001 From: wafuwafu13 Date: Wed, 4 May 2022 08:51:22 +0900 Subject: [PATCH 06/10] refactor(analyzer): Improve error message --- ginza/analyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ginza/analyzer.py b/ginza/analyzer.py index 5e1f7c6..f25931e 100644 --- a/ginza/analyzer.py +++ b/ginza/analyzer.py @@ -77,7 +77,7 @@ def set_nlp(self) -> None: try: nlp = spacy.load("ja_ginza") except IOError as e: - raise OSError("E050", 'You need to install "ja-ginza" or "ja-ginza-electra" by executing `pip install ja-ginza`.') + raise OSError("E050", 'You need to install "ja-ginza" or "ja-ginza-electra" by executing `pip install ja-ginza` or `pip install ja-ginza-electra`.') if self.disable_sentencizer: nlp.add_pipe("disable_sentencizer", before="parser") From d60d3d4bf4af0c5879357a5f95b36f72ea8eb317 Mon Sep 17 00:00:00 2001 From: Hiroshi Matsuda Date: Thu, 5 May 2022 12:42:22 +0900 Subject: [PATCH 07/10] add python_requires #243 --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 5e307d8..92b859c 100644 --- a/setup.py +++ b/setup.py @@ -16,6 +16,7 @@ "ginzame = ginza.command_line:main_ginzame", ], }, + python_requires=">=3.6", install_requires=[ "spacy>=3.2.0,<3.3.0", "plac>=1.3.3", From 4ac7809acd5ad4c6ee91ffed04a42f552a97d392 Mon Sep 17 00:00:00 2001 From: Hiroshi Matsuda Date: Tue, 9 Aug 2022 18:00:56 +0900 Subject: [PATCH 08/10] migrate to spacy v3.4 --- config/ja_ginza.meta.json | 4 ++-- config/ja_ginza_electra.meta.json | 4 ++-- ginza/command_line.py | 15 +++++++++++---- requirements.txt | 2 +- 4 files changed, 16 insertions(+), 9 deletions(-) diff --git a/config/ja_ginza.meta.json b/config/ja_ginza.meta.json index d276a14..343c229 100644 --- a/config/ja_ginza.meta.json +++ b/config/ja_ginza.meta.json @@ -1,7 +1,7 @@ { "lang":"ja", "name":"ginza", - "version":"5.1.0", + "version":"5.1.2", "description":"Japanese multi-task CNN trained on UD-Japanese BCCWJ r2.8 + GSK2014-A(2019). Assigns word2vec token vectors. Components: tok2vec, parser, ner, morphologizer, atteribute_ruler, compound_splitter, bunsetu_recognizer.", "author":"Megagon Labs Tokyo.", "email":"ginza@megagon.ai", @@ -34,7 +34,7 @@ } ], "parent_package":"spacy", - "spacy_version":">=3.2.0,<3.3.0", + "spacy_version":">=3.2.0,<3.5.0", "pipeline":[ "tok2vec", "parser", diff --git a/config/ja_ginza_electra.meta.json b/config/ja_ginza_electra.meta.json index 658d887..50390f2 100644 --- a/config/ja_ginza_electra.meta.json +++ b/config/ja_ginza_electra.meta.json @@ -1,7 +1,7 @@ { "lang":"ja", "name":"ginza_electra", - "version":"5.1.0", + "version":"5.1.2", "description":"Japanese multi-task CNN trained on UD-Japanese BCCWJ r2.8 + GSK2014-A(2019) + transformers-ud-japanese-electra--base. Components: transformer, parser, atteribute_ruler, ner, morphologizer, compound_splitter, bunsetu_recognizer.", "author":"Megagon Labs Tokyo.", "email":"ginza@megagon.ai", @@ -41,7 +41,7 @@ } ], "parent_package":"spacy", - "spacy_version":">=3.2.0,<3.3.0", + "spacy_version":">=3.2.0,<3.5.0", "pipeline":[ "transformer", "parser", diff --git a/ginza/command_line.py b/ginza/command_line.py index 01c25e7..2edacc7 100644 --- a/ginza/command_line.py +++ b/ginza/command_line.py @@ -28,7 +28,7 @@ def is_json(self): def open(self): if self.output_path: - self.output = open(self.output_path, "w") + self.output = open(self.output_path, "w", encoding="utf-8") else: self.output = sys.stdout @@ -154,10 +154,17 @@ def _analyze_tty(analyzer: Analyzer, output: _OutputWrapper) -> None: def _analyze_single(analyzer: Analyzer, output: _OutputWrapper, files: Iterable[str]) -> None: try: analyzer.set_nlp() + batch = [] for path in files: - with open(path, "r") as f: + with open(path, "r", encoding="utf-8") as f: for line in f: - output.write(analyzer.analyze_line(line)) + batch.append(line) + if len(batch) < MINI_BATCH_SIZE: + continue + output.write(analyzer.analyze_batch(batch)) + batch.clear() + if batch: + output.write(analyzer.analyze_batch(batch)) except KeyboardInterrupt: pass @@ -194,7 +201,7 @@ def _analyze_parallel(analyzer: Analyzer, output: _OutputWrapper, files: Iterabl def _data_loader(files: List[str], batch_size: int) -> Generator[List[str], None, None]: mini_batch = [] for path in files: - with open(path, "r") as f: + with open(path, "r", encoding="utf-8") as f: for line in f: mini_batch.append(line) if len(mini_batch) == batch_size: diff --git a/requirements.txt b/requirements.txt index 52ee66b..2f6952d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -spacy>=3.2.0,<3.3.0 +spacy>=3.2.0,<3.5.0 plac>=1.3.3 SudachiPy>=0.6.2,<0.7.0 SudachiDict-core>=20210802 From 7991f0dab50f93ce5dc3fb088068a15325567703 Mon Sep 17 00:00:00 2001 From: Hiroshi Matsuda Date: Tue, 9 Aug 2022 18:03:54 +0900 Subject: [PATCH 09/10] migrate to spacy v3.4 --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 92b859c..14a5a3a 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ }, python_requires=">=3.6", install_requires=[ - "spacy>=3.2.0,<3.3.0", + "spacy>=3.2.0,<3.5.0", "plac>=1.3.3", "SudachiPy>=0.6.2,<0.7.0", "SudachiDict-core>=20210802", @@ -29,5 +29,5 @@ name="ginza", packages=find_packages(include=["ginza"]), url="https://github.com/megagonlabs/ginza", - version='5.1.1', + version='5.1.2', ) From cea258e18f52c5611b76b1b4fa19ac66b5b14276 Mon Sep 17 00:00:00 2001 From: Hiroshi Matsuda Date: Tue, 9 Aug 2022 18:10:03 +0900 Subject: [PATCH 10/10] update readme --- README.md | 10 ++++++++++ docs/index.md | 11 +++++++++++ 2 files changed, 21 insertions(+) diff --git a/README.md b/README.md index af147eb..3797772 100644 --- a/README.md +++ b/README.md @@ -234,6 +234,16 @@ Please read the official documents to compile user dictionaries with `sudachipy` ### version 5.x +#### ginza-5.1.2 +- 2022-03-12 +- Migrate to spaCy v3.4 + +#### ginza-5.1.1 +- 2022-03-12 +- Improvements + - auto deploy for pypi by @nimiusrd in #184 + - modify github actions: trigger by tagging, stop uploading test pypi by @r-terada in #233 + #### ginza-5.1.0 - 2021-12-10, Euclase - Important changes diff --git a/docs/index.md b/docs/index.md index 1c43b12..3a59c5d 100644 --- a/docs/index.md +++ b/docs/index.md @@ -261,6 +261,17 @@ Contains information from mC4 which is made available under the ODC Attribution ### version 5.x +#### ginza-5.1.2 +- 2022-03-12 +- Migrate to spaCy v3.4 + +#### ginza-5.1.1 +- 2022-03-12 +- Improvements + - auto deploy for pypi by @nimiusrd in #184 + - modify github actions: trigger by tagging, stop uploading test pypi by @r-terada in #233 + + #### ginza-5.1.0 - 2021-12-10, Euclase - 重要な変更