From dadf153bd41c106f20c7d2543058f71f75bf2168 Mon Sep 17 00:00:00 2001
From: nikkie <takuyafjp+develop@gmail.com>
Date: Sun, 20 Mar 2022 11:50:59 +0900
Subject: [PATCH 01/10] Fix typo (fix #238)

---
 docs/index.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/index.md b/docs/index.md
index 90bed6c..1c43b12 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -80,7 +80,7 @@ $ pip uninstall ginza ja_ginza_electra
 $ pip uninstall ja_ginza
 ```
 
-旧バージョンの`j_ginza_electra`をインストールしている場合は次のコマンドでアンインストールします。
+旧バージョンの`ja_ginza_electra`をインストールしている場合は次のコマンドでアンインストールします。
 ```console
 $ pip uninstall ja_ginza_electra
 ```

From fe43469fff26c4c9d55c6ce6c151936b039dea0d Mon Sep 17 00:00:00 2001
From: r-terada <r.terada1993@gmail.com>
Date: Wed, 23 Mar 2022 21:48:32 +0900
Subject: [PATCH 02/10] add pytest github actions workflow

---
 .github/workflows/pytest.yml | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)
 create mode 100644 .github/workflows/pytest.yml

diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
new file mode 100644
index 0000000..c1cd64c
--- /dev/null
+++ b/.github/workflows/pytest.yml
@@ -0,0 +1,34 @@
+name: pytest
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - develop
+
+jobs:
+  pytest:
+    name: Run tests with pytest
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: [3.7, 3.8]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v1
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Upgrade pip
+        run: >-
+          python -m
+          pip install -U pip
+      - name: Install dependencies
+        run: >-
+          python -m
+          pip install . pytest pytest-mock ja-ginza ja-ginza-electra
+      - name: Run Tests
+        run: pytest

From 86c7b9b25f0eba7fbd868b8208b3daeaa5c62b2e Mon Sep 17 00:00:00 2001
From: r-terada <r.terada1993@gmail.com>
Date: Thu, 24 Mar 2022 01:07:16 +0900
Subject: [PATCH 03/10] stop using deepcopy

---
 ginza/tests/test_models.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ginza/tests/test_models.py b/ginza/tests/test_models.py
index 27b0951..a54e2f9 100644
--- a/ginza/tests/test_models.py
+++ b/ginza/tests/test_models.py
@@ -176,7 +176,6 @@ def test_tokenize(nlp, text, expected_tokens):
 def test_compound_spliter(nlp, text, len_a, len_b, len_c):
     assert len(nlp(text)) == len_c
     for split_mode, l in zip(["A", "B", "C"], [len_a, len_b, len_c]):
-        nlp = deepcopy(nlp)
         set_split_mode(nlp, split_mode)
         assert len(nlp(text)) == l
 

From fa4be0aa2bec7085d862c9d61f207dd369eb1cc4 Mon Sep 17 00:00:00 2001
From: wafuwafu13 <mariobaske@i.softbank.jp>
Date: Tue, 3 May 2022 20:34:42 +0900
Subject: [PATCH 04/10] docs(README.md): Add prompt

---
 README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 68aff5b..af147eb 100644
--- a/README.md
+++ b/README.md
@@ -534,10 +534,10 @@ Ginza uses the pytest framework for testing, and you can run the tests via `setu
 Some tests depends on the ginza default models (`ja-ginza`, `ja-ginza-electra`), so install them before the tests is needed.
 
 ```console
-pip install ja-ginza ja-ginza-electra
-pip install -e .
+$ pip install ja-ginza ja-ginza-electra
+$ pip install -e .
 # full test
-python setup.py test
+$ python setup.py test
 # test single file
-python setup.py test --addopts ginza/tests/test_analyzer.py
+$ python setup.py test --addopts ginza/tests/test_analyzer.py
 ```

From fdcc975f800d11379f65d4858affaea75af51396 Mon Sep 17 00:00:00 2001
From: wafuwafu13 <mariobaske@i.softbank.jp>
Date: Tue, 3 May 2022 20:39:11 +0900
Subject: [PATCH 05/10] test(analyzer): Add split_mode

---
 ginza/tests/test_analyzer.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/ginza/tests/test_analyzer.py b/ginza/tests/test_analyzer.py
index df857bd..17b9204 100644
--- a/ginza/tests/test_analyzer.py
+++ b/ginza/tests/test_analyzer.py
@@ -144,3 +144,26 @@ def test_analyze_batch(self, output_format, input_batch, tokens_batch, tokens_fu
         analyzer.output_format = output_format
         ret = analyzer.analyze_batch(input_batch)
         assert tokens_func(ret) == sum(tokens_batch, [])
+
+    @pytest.mark.parametrize(
+        "raises_analysis_before_set, tokens_func",
+        [
+            (TypeError, _tokens_conllu)
+        ],
+    )
+    @pytest.mark.parametrize(
+        "split_mode, input_text, tokens",
+        [
+            ("A", "機能性食品", ["機能", "性", "食品"]),
+            ("B", "機能性食品", ["機能性", "食品"]),
+            ("C", "機能性食品", ["機能性食品"]),
+        ],
+    )
+    def test_analyze_split(self, split_mode, input_text, tokens, raises_analysis_before_set, tokens_func, analyzer):
+        analyzer.split_mode = split_mode
+        with pytest.raises(raises_analysis_before_set):
+            analyzer.analyze_line(input_text)
+
+        analyzer.set_nlp()
+        ret = analyzer.analyze_line(input_text)
+        assert tokens_func(ret) == tokens

From ba7a6b9af7ca3352f3089450a7137088371bfc15 Mon Sep 17 00:00:00 2001
From: wafuwafu13 <mariobaske@i.softbank.jp>
Date: Wed, 4 May 2022 08:51:22 +0900
Subject: [PATCH 06/10] refactor(analyzer): Improve error message

---
 ginza/analyzer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ginza/analyzer.py b/ginza/analyzer.py
index 5e1f7c6..f25931e 100644
--- a/ginza/analyzer.py
+++ b/ginza/analyzer.py
@@ -77,7 +77,7 @@ def set_nlp(self) -> None:
                     try:
                         nlp = spacy.load("ja_ginza")
                     except IOError as e:
-                        raise OSError("E050", 'You need to install "ja-ginza" or "ja-ginza-electra" by executing `pip install ja-ginza`.')
+                        raise OSError("E050", 'You need to install "ja-ginza" or "ja-ginza-electra" by executing `pip install ja-ginza` or `pip install ja-ginza-electra`.')
 
             if self.disable_sentencizer:
                 nlp.add_pipe("disable_sentencizer", before="parser")

From d60d3d4bf4af0c5879357a5f95b36f72ea8eb317 Mon Sep 17 00:00:00 2001
From: Hiroshi Matsuda <hiroshi_matsuda@megagon.ai>
Date: Thu, 5 May 2022 12:42:22 +0900
Subject: [PATCH 07/10] add python_requires #243

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index 5e307d8..92b859c 100644
--- a/setup.py
+++ b/setup.py
@@ -16,6 +16,7 @@
             "ginzame = ginza.command_line:main_ginzame",
         ],
     },
+    python_requires=">=3.6",
     install_requires=[
         "spacy>=3.2.0,<3.3.0",
         "plac>=1.3.3",

From 4ac7809acd5ad4c6ee91ffed04a42f552a97d392 Mon Sep 17 00:00:00 2001
From: Hiroshi Matsuda <hiroshi_matsuda@megagon.ai>
Date: Tue, 9 Aug 2022 18:00:56 +0900
Subject: [PATCH 08/10] migrate to spacy v3.4

---
 config/ja_ginza.meta.json         |  4 ++--
 config/ja_ginza_electra.meta.json |  4 ++--
 ginza/command_line.py             | 15 +++++++++++----
 requirements.txt                  |  2 +-
 4 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/config/ja_ginza.meta.json b/config/ja_ginza.meta.json
index d276a14..343c229 100644
--- a/config/ja_ginza.meta.json
+++ b/config/ja_ginza.meta.json
@@ -1,7 +1,7 @@
 {
   "lang":"ja",
   "name":"ginza",
-  "version":"5.1.0",
+  "version":"5.1.2",
   "description":"Japanese multi-task CNN trained on UD-Japanese BCCWJ r2.8 + GSK2014-A(2019). Assigns word2vec token vectors. Components: tok2vec, parser, ner, morphologizer, atteribute_ruler, compound_splitter, bunsetu_recognizer.",
   "author":"Megagon Labs Tokyo.",
   "email":"ginza@megagon.ai",
@@ -34,7 +34,7 @@
     }
   ],
   "parent_package":"spacy",
-  "spacy_version":">=3.2.0,<3.3.0",
+  "spacy_version":">=3.2.0,<3.5.0",
   "pipeline":[
     "tok2vec",
     "parser",
diff --git a/config/ja_ginza_electra.meta.json b/config/ja_ginza_electra.meta.json
index 658d887..50390f2 100644
--- a/config/ja_ginza_electra.meta.json
+++ b/config/ja_ginza_electra.meta.json
@@ -1,7 +1,7 @@
 {
   "lang":"ja",
   "name":"ginza_electra",
-  "version":"5.1.0",
+  "version":"5.1.2",
   "description":"Japanese multi-task CNN trained on UD-Japanese BCCWJ r2.8 + GSK2014-A(2019) + transformers-ud-japanese-electra--base. Components: transformer, parser, atteribute_ruler, ner, morphologizer, compound_splitter, bunsetu_recognizer.",
   "author":"Megagon Labs Tokyo.",
   "email":"ginza@megagon.ai",
@@ -41,7 +41,7 @@
     }
   ],
   "parent_package":"spacy",
-  "spacy_version":">=3.2.0,<3.3.0",
+  "spacy_version":">=3.2.0,<3.5.0",
   "pipeline":[
     "transformer",
     "parser",
diff --git a/ginza/command_line.py b/ginza/command_line.py
index 01c25e7..2edacc7 100644
--- a/ginza/command_line.py
+++ b/ginza/command_line.py
@@ -28,7 +28,7 @@ def is_json(self):
 
     def open(self):
         if self.output_path:
-            self.output = open(self.output_path, "w")
+            self.output = open(self.output_path, "w", encoding="utf-8")
         else:
             self.output = sys.stdout
 
@@ -154,10 +154,17 @@ def _analyze_tty(analyzer: Analyzer, output: _OutputWrapper) -> None:
 def _analyze_single(analyzer: Analyzer, output: _OutputWrapper, files: Iterable[str]) -> None:
     try:
         analyzer.set_nlp()
+        batch = []
         for path in files:
-            with open(path, "r") as f:
+            with open(path, "r", encoding="utf-8") as f:
                 for line in f:
-                    output.write(analyzer.analyze_line(line))
+                    batch.append(line)
+                    if len(batch) < MINI_BATCH_SIZE:
+                        continue
+                    output.write(analyzer.analyze_batch(batch))
+                    batch.clear()
+        if batch:
+            output.write(analyzer.analyze_batch(batch))
     except KeyboardInterrupt:
         pass
 
@@ -194,7 +201,7 @@ def _analyze_parallel(analyzer: Analyzer, output: _OutputWrapper, files: Iterabl
 def _data_loader(files: List[str], batch_size: int) -> Generator[List[str], None, None]:
     mini_batch = []
     for path in files:
-        with open(path, "r") as f:
+        with open(path, "r", encoding="utf-8") as f:
             for line in f:
                 mini_batch.append(line)
                 if len(mini_batch) == batch_size:
diff --git a/requirements.txt b/requirements.txt
index 52ee66b..2f6952d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-spacy>=3.2.0,<3.3.0
+spacy>=3.2.0,<3.5.0
 plac>=1.3.3
 SudachiPy>=0.6.2,<0.7.0
 SudachiDict-core>=20210802

From 7991f0dab50f93ce5dc3fb088068a15325567703 Mon Sep 17 00:00:00 2001
From: Hiroshi Matsuda <hiroshi_matsuda@megagon.ai>
Date: Tue, 9 Aug 2022 18:03:54 +0900
Subject: [PATCH 09/10] migrate to spacy v3.4

---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 92b859c..14a5a3a 100644
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,7 @@
     },
     python_requires=">=3.6",
     install_requires=[
-        "spacy>=3.2.0,<3.3.0",
+        "spacy>=3.2.0,<3.5.0",
         "plac>=1.3.3",
         "SudachiPy>=0.6.2,<0.7.0",
         "SudachiDict-core>=20210802",
@@ -29,5 +29,5 @@
     name="ginza",
     packages=find_packages(include=["ginza"]),
     url="https://github.com/megagonlabs/ginza",
-    version='5.1.1',
+    version='5.1.2',
 )

From cea258e18f52c5611b76b1b4fa19ac66b5b14276 Mon Sep 17 00:00:00 2001
From: Hiroshi Matsuda <hiroshi_matsuda@megagon.ai>
Date: Tue, 9 Aug 2022 18:10:03 +0900
Subject: [PATCH 10/10] update readme

---
 README.md     | 10 ++++++++++
 docs/index.md | 11 +++++++++++
 2 files changed, 21 insertions(+)

diff --git a/README.md b/README.md
index af147eb..3797772 100644
--- a/README.md
+++ b/README.md
@@ -234,6 +234,16 @@ Please read the official documents to compile user dictionaries with `sudachipy`
 
 ### version 5.x
 
+#### ginza-5.1.2
+- 2022-08-09
+- Migrate to spaCy v3.4
+
+#### ginza-5.1.1
+- 2022-03-12
+- Improvements
+  - auto deploy for pypi by @nimiusrd in #184
+  - modify github actions: trigger by tagging, stop uploading test pypi by @r-terada in #233
+
 #### ginza-5.1.0
 - 2021-12-10, Euclase
 - Important changes
diff --git a/docs/index.md b/docs/index.md
index 1c43b12..3a59c5d 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -261,6 +261,17 @@ Contains information from mC4 which is made available under the ODC Attribution
 
 ### version 5.x
 
+#### ginza-5.1.2
+- 2022-08-09
+- Migrate to spaCy v3.4
+
+#### ginza-5.1.1
+- 2022-03-12
+- Improvements
+  - auto deploy for pypi by @nimiusrd in #184
+  - modify github actions: trigger by tagging, stop uploading test pypi by @r-terada in #233
+
+
 #### ginza-5.1.0
 - 2021-12-10, Euclase
 - 重要な変更