Skip to content

Commit

Permalink
Merge pull request #250 from megagonlabs/develop
Browse files Browse the repository at this point in the history
Release v5.1.2
  • Loading branch information
hiroshi-matsuda-rit authored Aug 9, 2022
2 parents 24dee81 + d94979d commit 49c93ad
Show file tree
Hide file tree
Showing 11 changed files with 103 additions and 18 deletions.
34 changes: 34 additions & 0 deletions .github/workflows/pytest.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
name: pytest

on:
push:
branches:
- master
pull_request:
branches:
- develop

jobs:
pytest:
name: Run tests with pytest
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.7, 3.8]
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v1
with:
python-version: ${{ matrix.python-version }}
- name: Upgrade pip
run: >-
python -m
pip install -U pip
- name: Install dependencies
run: >-
python -m
pip install . pytest pytest-mock ja-ginza ja-ginza-electra
- name: Run Tests
run: pytest
18 changes: 14 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,16 @@ Please read the official documents to compile user dictionaries with `sudachipy`

### version 5.x

#### ginza-5.1.2
- 2022-03-12
- Migrate to spaCy v3.4

#### ginza-5.1.1
- 2022-03-12
- Improvements
- auto deploy for pypi by @nimiusrd in #184
- modify github actions: trigger by tagging, stop uploading test pypi by @r-terada in #233

#### ginza-5.1.0
- 2021-12-10, Euclase
- Important changes
Expand Down Expand Up @@ -534,10 +544,10 @@ Ginza uses the pytest framework for testing, and you can run the tests via `setu
Some tests depend on the ginza default models (`ja-ginza`, `ja-ginza-electra`), so you need to install them before running the tests.

```console
pip install ja-ginza ja-ginza-electra
pip install -e .
$ pip install ja-ginza ja-ginza-electra
$ pip install -e .
# full test
python setup.py test
$ python setup.py test
# test single file
python setup.py test --addopts ginza/tests/test_analyzer.py
$ python setup.py test --addopts ginza/tests/test_analyzer.py
```
4 changes: 2 additions & 2 deletions config/ja_ginza.meta.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"lang":"ja",
"name":"ginza",
"version":"5.1.0",
"version":"5.1.2",
"description":"Japanese multi-task CNN trained on UD-Japanese BCCWJ r2.8 + GSK2014-A(2019). Assigns word2vec token vectors. Components: tok2vec, parser, ner, morphologizer, attribute_ruler, compound_splitter, bunsetu_recognizer.",
"author":"Megagon Labs Tokyo.",
"email":"ginza@megagon.ai",
Expand Down Expand Up @@ -34,7 +34,7 @@
}
],
"parent_package":"spacy",
"spacy_version":">=3.2.0,<3.3.0",
"spacy_version":">=3.2.0,<3.5.0",
"pipeline":[
"tok2vec",
"parser",
Expand Down
4 changes: 2 additions & 2 deletions config/ja_ginza_electra.meta.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"lang":"ja",
"name":"ginza_electra",
"version":"5.1.0",
"version":"5.1.2",
"description":"Japanese multi-task CNN trained on UD-Japanese BCCWJ r2.8 + GSK2014-A(2019) + transformers-ud-japanese-electra--base. Components: transformer, parser, attribute_ruler, ner, morphologizer, compound_splitter, bunsetu_recognizer.",
"author":"Megagon Labs Tokyo.",
"email":"ginza@megagon.ai",
Expand Down Expand Up @@ -41,7 +41,7 @@
}
],
"parent_package":"spacy",
"spacy_version":">=3.2.0,<3.3.0",
"spacy_version":">=3.2.0,<3.5.0",
"pipeline":[
"transformer",
"parser",
Expand Down
13 changes: 12 additions & 1 deletion docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ $ pip uninstall ginza ja_ginza_electra
$ pip uninstall ja_ginza
```

旧バージョンの`j_ginza_electra`をインストールしている場合は次のコマンドでアンインストールします。
旧バージョンの`ja_ginza_electra`をインストールしている場合は次のコマンドでアンインストールします。
```console
$ pip uninstall ja_ginza_electra
```
Expand Down Expand Up @@ -261,6 +261,17 @@ Contains information from mC4 which is made available under the ODC Attribution

### version 5.x

#### ginza-5.1.2
- 2022-03-12
- Migrate to spaCy v3.4

#### ginza-5.1.1
- 2022-03-12
- Improvements
- auto deploy for pypi by @nimiusrd in #184
- modify github actions: trigger by tagging, stop uploading test pypi by @r-terada in #233


#### ginza-5.1.0
- 2021-12-10, Euclase
- 重要な変更
Expand Down
2 changes: 1 addition & 1 deletion ginza/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def set_nlp(self) -> None:
try:
nlp = spacy.load("ja_ginza")
except IOError as e:
raise OSError("E050", 'You need to install "ja-ginza" or "ja-ginza-electra" by executing `pip install ja-ginza`.')
raise OSError("E050", 'You need to install "ja-ginza" or "ja-ginza-electra" by executing `pip install ja-ginza` or `pip install ja-ginza-electra`.')

if self.disable_sentencizer:
nlp.add_pipe("disable_sentencizer", before="parser")
Expand Down
15 changes: 11 additions & 4 deletions ginza/command_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def is_json(self):

def open(self):
if self.output_path:
self.output = open(self.output_path, "w")
self.output = open(self.output_path, "w", encoding="utf-8")
else:
self.output = sys.stdout

Expand Down Expand Up @@ -154,10 +154,17 @@ def _analyze_tty(analyzer: Analyzer, output: _OutputWrapper) -> None:
def _analyze_single(analyzer: Analyzer, output: _OutputWrapper, files: Iterable[str]) -> None:
try:
analyzer.set_nlp()
batch = []
for path in files:
with open(path, "r") as f:
with open(path, "r", encoding="utf-8") as f:
for line in f:
output.write(analyzer.analyze_line(line))
batch.append(line)
if len(batch) < MINI_BATCH_SIZE:
continue
output.write(analyzer.analyze_batch(batch))
batch.clear()
if batch:
output.write(analyzer.analyze_batch(batch))
except KeyboardInterrupt:
pass

Expand Down Expand Up @@ -194,7 +201,7 @@ def _analyze_parallel(analyzer: Analyzer, output: _OutputWrapper, files: Iterabl
def _data_loader(files: List[str], batch_size: int) -> Generator[List[str], None, None]:
mini_batch = []
for path in files:
with open(path, "r") as f:
with open(path, "r", encoding="utf-8") as f:
for line in f:
mini_batch.append(line)
if len(mini_batch) == batch_size:
Expand Down
23 changes: 23 additions & 0 deletions ginza/tests/test_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,3 +144,26 @@ def test_analyze_batch(self, output_format, input_batch, tokens_batch, tokens_fu
analyzer.output_format = output_format
ret = analyzer.analyze_batch(input_batch)
assert tokens_func(ret) == sum(tokens_batch, [])

# Checks split-mode handling of analyze_line(): for each split mode, analysis
# attempted before set_nlp() must raise, and analysis after set_nlp() must
# yield the expected tokens.
@pytest.mark.parametrize(
"raises_analysis_before_set, tokens_func",
[
(TypeError, _tokens_conllu)
],
)
# Stacked parametrize: each split-mode case below is combined with the single
# (expected exception, token extractor) case above. The same input text is
# expected to split into 3, 2 and 1 tokens for modes "A", "B" and "C".
@pytest.mark.parametrize(
"split_mode, input_text, tokens",
[
("A", "機能性食品", ["機能", "性", "食品"]),
("B", "機能性食品", ["機能性", "食品"]),
("C", "機能性食品", ["機能性食品"]),
],
)
def test_analyze_split(self, split_mode, input_text, tokens, raises_analysis_before_set, tokens_func, analyzer):
# `analyzer` is supplied by a fixture defined outside this hunk.
analyzer.split_mode = split_mode
# Calling analyze_line() before set_nlp() is expected to raise.
with pytest.raises(raises_analysis_before_set):
analyzer.analyze_line(input_text)

# After set_nlp(), the same input must be tokenized according to split_mode.
analyzer.set_nlp()
ret = analyzer.analyze_line(input_text)
# tokens_func extracts the token list from the analyzer's output for comparison.
assert tokens_func(ret) == tokens
1 change: 0 additions & 1 deletion ginza/tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,6 @@ def test_tokenize(nlp, text, expected_tokens):
def test_compound_spliter(nlp, text, len_a, len_b, len_c):
assert len(nlp(text)) == len_c
for split_mode, l in zip(["A", "B", "C"], [len_a, len_b, len_c]):
nlp = deepcopy(nlp)
set_split_mode(nlp, split_mode)
assert len(nlp(text)) == l

Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
spacy>=3.2.0,<3.3.0
spacy>=3.2.0,<3.5.0
plac>=1.3.3
SudachiPy>=0.6.2,<0.7.0
SudachiDict-core>=20210802
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,9 @@
"ginzame = ginza.command_line:main_ginzame",
],
},
python_requires=">=3.6",
install_requires=[
"spacy>=3.2.0,<3.3.0",
"spacy>=3.2.0,<3.5.0",
"plac>=1.3.3",
"SudachiPy>=0.6.2,<0.7.0",
"SudachiDict-core>=20210802",
Expand All @@ -28,5 +29,5 @@
name="ginza",
packages=find_packages(include=["ginza"]),
url="https://github.com/megagonlabs/ginza",
version='5.1.1',
version='5.1.2',
)

0 comments on commit 49c93ad

Please sign in to comment.