From ebeaeba454e3446dbe7a6c1a7cf835cfa72f31c2 Mon Sep 17 00:00:00 2001 From: r-terada Date: Tue, 23 Nov 2021 07:52:40 +0900 Subject: [PATCH 1/5] assert hash_comment == "analyze" if output format is json print or skip hash comment break json format, and json parser becomes unable to parse the result --- ginza/command_line.py | 1 + ginza/tests/test_command_line.py | 13 ++++++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/ginza/command_line.py b/ginza/command_line.py index 856b574..12df5be 100644 --- a/ginza/command_line.py +++ b/ginza/command_line.py @@ -62,6 +62,7 @@ def run( files: List[str] = None, ): assert model_path is None or ensure_model is None + assert not (output_format in ["3", "json"] and hash_comment != "analyze"), "output_format=json only accept hash_comment=analyze" if parallel_level <= 0: level = max(1, cpu_count() + parallel_level) diff --git a/ginza/tests/test_command_line.py b/ginza/tests/test_command_line.py index 2bd5484..efefc85 100644 --- a/ginza/tests/test_command_line.py +++ b/ginza/tests/test_command_line.py @@ -56,7 +56,7 @@ def output_file(tmpdir: Path) -> Path: def _conllu_parsable(result: str): for line in result.split("\n"): - if line.startswith("# text = ") or line.strip() == "": + if line.startswith("#") or line.strip() == "": continue if not len(line.strip().split("\t")) == 10: raise Exception @@ -64,7 +64,7 @@ def _conllu_parsable(result: str): def _cabocha_parsable(result: str): for line in result.split("\n"): - if line.strip() in ("", "EOS") or line.startswith("*"): + if line.strip() in ("", "EOS") or line.startswith("*") or line.startswith("#"): continue if not len(line.split("\t")) == 3: raise Exception @@ -74,7 +74,7 @@ def _cabocha_parsable(result: str): def _mecab_parsable(result: str): for line in result.split("\n"): - if line.strip() in ("", "EOS"): + if line.strip() in ("", "EOS") or line.startswith("#"): continue if not len(line.split("\t")) == 2: raise Exception @@ -210,6 +210,13 @@ def 
test_output_format(self, output_format, result_parsable, input_file): assert p.returncode == 0 result_parsable(p.stdout.strip()) + @pytest.mark.parametrize( + "hash_comment", ["print", "skip"] + ) + def test_json_cannot_accept_hash_comment_not_analyze(self, hash_comment, input_file): + p = run_cmd(["ginza", "-c", hash_comment, "-f", "json", input_file]) + assert p.returncode != 0 + def test_require_gpu(self, input_file): p = run_cmd(["ginza", "-g", input_file]) gpu_available = int(os.environ.get("CUDA_VISIBLE_DEVICES", -1)) > 0 From cc9324241d027094ddb3617c72533f3060fc90d5 Mon Sep 17 00:00:00 2001 From: r-terada Date: Tue, 23 Nov 2021 07:59:26 +0900 Subject: [PATCH 2/5] remove msg, according to assert about model_path --- ginza/command_line.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ginza/command_line.py b/ginza/command_line.py index 12df5be..046e86a 100644 --- a/ginza/command_line.py +++ b/ginza/command_line.py @@ -62,7 +62,7 @@ def run( files: List[str] = None, ): assert model_path is None or ensure_model is None - assert not (output_format in ["3", "json"] and hash_comment != "analyze"), "output_format=json only accept hash_comment=analyze" + assert not (output_format in ["3", "json"] and hash_comment != "analyze") if parallel_level <= 0: level = max(1, cpu_count() + parallel_level) From 3f0d518beb0b9957a4225a88725290cc5a08e7fe Mon Sep 17 00:00:00 2001 From: r-terada Date: Tue, 23 Nov 2021 09:07:16 +0900 Subject: [PATCH 3/5] stop using assert, only warning --- ginza/command_line.py | 7 ++++++- ginza/tests/test_command_line.py | 11 ++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/ginza/command_line.py b/ginza/command_line.py index 046e86a..056e68a 100644 --- a/ginza/command_line.py +++ b/ginza/command_line.py @@ -62,7 +62,12 @@ def run( files: List[str] = None, ): assert model_path is None or ensure_model is None - assert not (output_format in ["3", "json"] and hash_comment != "analyze") + if output_format in 
["3", "json"] and hash_comment != "analyze": + print( + f'hash_comment={hash_comment} may break output json if input contains a line starts with "#".\n' + 'In order to keep the json in proper format, please use hash_comment=analyze or remove the lines start with "#" from input.', + file=sys.stderr + ) if parallel_level <= 0: level = max(1, cpu_count() + parallel_level) diff --git a/ginza/tests/test_command_line.py b/ginza/tests/test_command_line.py index efefc85..b9ea14e 100644 --- a/ginza/tests/test_command_line.py +++ b/ginza/tests/test_command_line.py @@ -213,9 +213,14 @@ def test_output_format(self, output_format, result_parsable, input_file): @pytest.mark.parametrize( "hash_comment", ["print", "skip"] ) - def test_json_cannot_accept_hash_comment_not_analyze(self, hash_comment, input_file): - p = run_cmd(["ginza", "-c", hash_comment, "-f", "json", input_file]) - assert p.returncode != 0 + def test_warn_if_json_hash_comment_not_analyze(self, hash_comment, input_file): + p = run_cmd(["ginza", "-c", hash_comment, "-f", "json", input_file], stderr=sp.PIPE) + assert p.returncode == 0 + msg = ( + f'hash_comment={hash_comment} may break output json if input contains a line starts with "#".\n' + 'In order to keep the json in proper format, please use hash_comment=analyze or remove the lines start with "#" from input.' 
+ ) + assert msg in p.stderr def test_require_gpu(self, input_file): p = run_cmd(["ginza", "-g", input_file]) From d07602e0a75e75e9fd2032ba94b376dfc08d1aff Mon Sep 17 00:00:00 2001 From: Hiroshi Matsuda <40782025+hiroshi-matsuda-rit@users.noreply.github.com> Date: Tue, 23 Nov 2021 11:30:49 +0900 Subject: [PATCH 4/5] change warning message --- ginza/command_line.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ginza/command_line.py b/ginza/command_line.py index 056e68a..736fe6a 100644 --- a/ginza/command_line.py +++ b/ginza/command_line.py @@ -64,8 +64,7 @@ def run( assert model_path is None or ensure_model is None if output_format in ["3", "json"] and hash_comment != "analyze": print( - f'hash_comment={hash_comment} may break output json if input contains a line starts with "#".\n' - 'In order to keep the json in proper format, please use hash_comment=analyze or remove the lines start with "#" from input.', + f'hash_comment="{hash_comment}" not permitted for JSON output. Forced to use hash_comment="analyzer".', file=sys.stderr ) From 73972b45e1f0f6f399893d72b86d0c72921a040a Mon Sep 17 00:00:00 2001 From: Hiroshi Matsuda <40782025+hiroshi-matsuda-rit@users.noreply.github.com> Date: Tue, 23 Nov 2021 11:33:15 +0900 Subject: [PATCH 5/5] typo --- ginza/command_line.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ginza/command_line.py b/ginza/command_line.py index 736fe6a..8db72e6 100644 --- a/ginza/command_line.py +++ b/ginza/command_line.py @@ -64,7 +64,7 @@ def run( assert model_path is None or ensure_model is None if output_format in ["3", "json"] and hash_comment != "analyze": print( - f'hash_comment="{hash_comment}" not permitted for JSON output. Forced to use hash_comment="analyzer".', + f'hash_comment="{hash_comment}" not permitted for JSON output. Forced to use hash_comment="analyze".', file=sys.stderr )