From ebeaeba454e3446dbe7a6c1a7cf835cfa72f31c2 Mon Sep 17 00:00:00 2001
From: r-terada <r.terada1993@gmail.com>
Date: Tue, 23 Nov 2021 07:52:40 +0900
Subject: [PATCH 1/5] assert hash_comment == "analyze" if output format is json

Printing or skipping hash comments breaks the JSON format, and a JSON parser becomes unable to parse the result.
---
 ginza/command_line.py            |  1 +
 ginza/tests/test_command_line.py | 13 ++++++++++---
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/ginza/command_line.py b/ginza/command_line.py
index 856b574..12df5be 100644
--- a/ginza/command_line.py
+++ b/ginza/command_line.py
@@ -62,6 +62,7 @@ def run(
     files: List[str] = None,
 ):
     assert model_path is None or ensure_model is None
+    assert not (output_format in ["3", "json"] and hash_comment != "analyze"), "output_format=json only accept hash_comment=analyze"
 
     if parallel_level <= 0:
         level = max(1, cpu_count() + parallel_level)
diff --git a/ginza/tests/test_command_line.py b/ginza/tests/test_command_line.py
index 2bd5484..efefc85 100644
--- a/ginza/tests/test_command_line.py
+++ b/ginza/tests/test_command_line.py
@@ -56,7 +56,7 @@ def output_file(tmpdir: Path) -> Path:
 
 def _conllu_parsable(result: str):
     for line in result.split("\n"):
-        if line.startswith("# text = ") or line.strip() == "":
+        if line.startswith("#") or line.strip() == "":
             continue
         if not len(line.strip().split("\t")) == 10:
             raise Exception
@@ -64,7 +64,7 @@ def _conllu_parsable(result: str):
 
 def _cabocha_parsable(result: str):
     for line in result.split("\n"):
-        if line.strip() in ("", "EOS") or line.startswith("*"):
+        if line.strip() in ("", "EOS") or line.startswith("*") or line.startswith("#"):
             continue
         if not len(line.split("\t")) == 3:
             raise Exception
@@ -74,7 +74,7 @@ def _cabocha_parsable(result: str):
 
 def _mecab_parsable(result: str):
     for line in result.split("\n"):
-        if line.strip() in ("", "EOS"):
+        if line.strip() in ("", "EOS") or line.startswith("#"):
             continue
         if not len(line.split("\t")) == 2:
             raise Exception
@@ -210,6 +210,13 @@ def test_output_format(self, output_format, result_parsable, input_file):
         assert p.returncode == 0
         result_parsable(p.stdout.strip())
 
+    @pytest.mark.parametrize(
+        "hash_comment", ["print", "skip"]
+    )
+    def test_json_cannot_accept_hash_comment_not_analyze(self, hash_comment, input_file):
+        p = run_cmd(["ginza", "-c", hash_comment, "-f", "json", input_file])
+        assert p.returncode != 0
+
     def test_require_gpu(self, input_file):
         p = run_cmd(["ginza", "-g", input_file])
         gpu_available = int(os.environ.get("CUDA_VISIBLE_DEVICES", -1)) > 0

From cc9324241d027094ddb3617c72533f3060fc90d5 Mon Sep 17 00:00:00 2001
From: r-terada <r.terada1993@gmail.com>
Date: Tue, 23 Nov 2021 07:59:26 +0900
Subject: [PATCH 2/5] remove assert msg, for consistency with the assert about model_path

---
 ginza/command_line.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ginza/command_line.py b/ginza/command_line.py
index 12df5be..046e86a 100644
--- a/ginza/command_line.py
+++ b/ginza/command_line.py
@@ -62,7 +62,7 @@ def run(
     files: List[str] = None,
 ):
     assert model_path is None or ensure_model is None
-    assert not (output_format in ["3", "json"] and hash_comment != "analyze"), "output_format=json only accept hash_comment=analyze"
+    assert not (output_format in ["3", "json"] and hash_comment != "analyze")
 
     if parallel_level <= 0:
         level = max(1, cpu_count() + parallel_level)

From 3f0d518beb0b9957a4225a88725290cc5a08e7fe Mon Sep 17 00:00:00 2001
From: r-terada <r.terada1993@gmail.com>
Date: Tue, 23 Nov 2021 09:07:16 +0900
Subject: [PATCH 3/5] stop using assert, only warning

---
 ginza/command_line.py            |  7 ++++++-
 ginza/tests/test_command_line.py | 11 ++++++++---
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/ginza/command_line.py b/ginza/command_line.py
index 046e86a..056e68a 100644
--- a/ginza/command_line.py
+++ b/ginza/command_line.py
@@ -62,7 +62,12 @@ def run(
     files: List[str] = None,
 ):
     assert model_path is None or ensure_model is None
-    assert not (output_format in ["3", "json"] and hash_comment != "analyze")
+    if output_format in ["3", "json"] and hash_comment != "analyze":
+        print(
+            f'hash_comment={hash_comment} may break output json if input contains a line starts with "#".\n'
+            'In order to keep the json in proper format, please use hash_comment=analyze or remove the lines start with "#" from input.',
+            file=sys.stderr
+        )
 
     if parallel_level <= 0:
         level = max(1, cpu_count() + parallel_level)
diff --git a/ginza/tests/test_command_line.py b/ginza/tests/test_command_line.py
index efefc85..b9ea14e 100644
--- a/ginza/tests/test_command_line.py
+++ b/ginza/tests/test_command_line.py
@@ -213,9 +213,14 @@ def test_output_format(self, output_format, result_parsable, input_file):
     @pytest.mark.parametrize(
         "hash_comment", ["print", "skip"]
     )
-    def test_json_cannot_accept_hash_comment_not_analyze(self, hash_comment, input_file):
-        p = run_cmd(["ginza", "-c", hash_comment, "-f", "json", input_file])
-        assert p.returncode != 0
+    def test_warn_if_json_hash_comment_not_analyze(self, hash_comment, input_file):
+        p = run_cmd(["ginza", "-c", hash_comment, "-f", "json", input_file], stderr=sp.PIPE)
+        assert p.returncode == 0
+        msg = (
+            f'hash_comment={hash_comment} may break output json if input contains a line starts with "#".\n'
+            'In order to keep the json in proper format, please use hash_comment=analyze or remove the lines start with "#" from input.'
+        )
+        assert msg in p.stderr
 
     def test_require_gpu(self, input_file):
         p = run_cmd(["ginza", "-g", input_file])

From d07602e0a75e75e9fd2032ba94b376dfc08d1aff Mon Sep 17 00:00:00 2001
From: Hiroshi Matsuda <40782025+hiroshi-matsuda-rit@users.noreply.github.com>
Date: Tue, 23 Nov 2021 11:30:49 +0900
Subject: [PATCH 4/5] change warning message

---
 ginza/command_line.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/ginza/command_line.py b/ginza/command_line.py
index 056e68a..736fe6a 100644
--- a/ginza/command_line.py
+++ b/ginza/command_line.py
@@ -64,8 +64,7 @@ def run(
     assert model_path is None or ensure_model is None
     if output_format in ["3", "json"] and hash_comment != "analyze":
         print(
-            f'hash_comment={hash_comment} may break output json if input contains a line starts with "#".\n'
-            'In order to keep the json in proper format, please use hash_comment=analyze or remove the lines start with "#" from input.',
+            f'hash_comment="{hash_comment}" not permitted for JSON output. Forced to use hash_comment="analyzer".',
             file=sys.stderr
         )
 

From 73972b45e1f0f6f399893d72b86d0c72921a040a Mon Sep 17 00:00:00 2001
From: Hiroshi Matsuda <40782025+hiroshi-matsuda-rit@users.noreply.github.com>
Date: Tue, 23 Nov 2021 11:33:15 +0900
Subject: [PATCH 5/5] typo

---
 ginza/command_line.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ginza/command_line.py b/ginza/command_line.py
index 736fe6a..8db72e6 100644
--- a/ginza/command_line.py
+++ b/ginza/command_line.py
@@ -64,7 +64,7 @@ def run(
     assert model_path is None or ensure_model is None
     if output_format in ["3", "json"] and hash_comment != "analyze":
         print(
-            f'hash_comment="{hash_comment}" not permitted for JSON output. Forced to use hash_comment="analyzer".',
+            f'hash_comment="{hash_comment}" not permitted for JSON output. Forced to use hash_comment="analyze".',
             file=sys.stderr
         )