Merge pull request #968 from bact/add-cli-tests
Make CLI able to handle Unicode character output on Windows console
bact authored Nov 2, 2024
2 parents 974b153 + 252e64e commit cf6997f
Showing 10 changed files with 110 additions and 80 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/unittest.yml
@@ -72,6 +72,8 @@ jobs:
SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL: True
run: pip install -r docker_requirements.txt
- name: Install PyThaiNLP
env:
PYTHONIOENCODING: utf-8
run: pip install .
# If you want to install a safe small set of optional dependencies, use:
# pip install .[compact]
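The new PYTHONIOENCODING: utf-8 entry forces the Python process started by this step to use UTF-8 for its standard streams, regardless of the runner's console code page. A quick way to confirm the variable took effect is a small check script like the one below (hypothetical, not part of this PR):

# Hypothetical check, not part of this PR: with PYTHONIOENCODING=utf-8 set in
# the environment, both standard streams report UTF-8 even on a Windows runner
# whose console would otherwise default to a legacy code page such as cp1252.
import sys

print("stdout encoding:", sys.stdout.encoding)
print("stderr encoding:", sys.stderr.encoding)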
4 changes: 4 additions & 0 deletions pythainlp/cli/__init__.py
@@ -3,9 +3,13 @@
# SPDX-License-Identifier: Apache-2.0
"""Command line helpers."""

import io
import sys
from argparse import ArgumentParser

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8")
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8")

# a command should start with a verb when possible
COMMANDS = sorted(["data", "soundex", "tag", "tokenize", "benchmark"])

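The two io.TextIOWrapper lines above rewrap the standard streams' binary buffers as UTF-8 text streams, so printing Thai text no longer raises UnicodeEncodeError on Windows consoles that default to a code page without Thai characters. A minimal standalone sketch of the same idea (not PyThaiNLP code; on Python 3.7+, sys.stdout.reconfigure(encoding="utf-8") achieves the same effect):

# Standalone sketch of the technique used above.
import io
import sys

# Rewrap the underlying binary buffers as UTF-8 text streams, so Thai output
# stays encodable even when the console's default encoding (e.g. cp437 or
# cp1252) cannot represent it.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8")
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8")

print("ทดสอบ")  # encoded as UTF-8 bytes regardless of the console locale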
20 changes: 13 additions & 7 deletions pythainlp/cli/benchmark.py
@@ -11,6 +11,7 @@

from pythainlp import cli
from pythainlp.benchmarks import word_tokenization
from pythainlp.tools import safe_print


def _read_file(path):
@@ -81,7 +82,7 @@ def __init__(self, name, argv):
expected
), "Input and test files do not have the same number of samples"

print(
safe_print(
"Benchmarking %s against %s with %d samples in total"
% (args.input_file, args.test_file, len(actual))
)
@@ -121,12 +122,12 @@ def __init__(self, name, argv):
/ statistics["word_level:total_words_in_ref_sample"]
)

print("============== Benchmark Result ==============")
safe_print("============== Benchmark Result ==============")

for c in ["tp", "fn", "tn", "fp", "precision", "recall"]:
c = f"char_level:{c}"
v = statistics[c]
print(f"{c:>40s} {v:.4f}")
safe_print(f"{c:>40s} {v:.4f}")

for c in [
"total_words_in_sample",
@@ -137,20 +138,20 @@ def __init__(self, name, argv):
]:
c = f"word_level:{c}"
v = statistics[c]
print(f"{c:>40s} {v:.4f}")
safe_print(f"{c:>40s} {v:.4f}")

if args.save_details:
dir_name = os.path.dirname(args.input_file)
file_name = args.input_file.split("/")[-1].split(".")[0]

res_path = "%s/eval-%s.yml" % (dir_name, file_name)
print("Evaluation result is saved to %s" % res_path)
safe_print("Evaluation result is saved to %s" % res_path)

with open(res_path, "w", encoding="utf-8") as outfile:
yaml.dump(statistics, outfile, default_flow_style=False)

res_path = "%s/eval-details-%s.json" % (dir_name, file_name)
print("Details of comparisons is saved to %s" % res_path)
safe_print("Details of comparisons is saved to %s" % res_path)

with open(res_path, "w", encoding="utf-8") as f:
samples = []
@@ -160,7 +161,7 @@ def __init__(self, name, argv):
del r["actual"]

samples.append(
{"metrics": r, "expected": expected, "actual": actual, "id": i}
{
"metrics": r,
"expected": expected,
"actual": actual,
"id": i,
}
)

details = {"metrics": statistics, "samples": samples}
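safe_print is imported from pythainlp.tools, but its implementation is not part of this diff. A plausible sketch of such a helper, written here purely for illustration and not taken from the PyThaiNLP source, catches the encoding error that a plain print() would raise and falls back to escaped output:

# Hypothetical sketch of a safe_print-style helper; the real
# pythainlp.tools.safe_print may differ, since this diff only shows call sites.
import sys


def safe_print(text: str) -> None:
    """Print text, escaping characters the active console cannot encode."""
    try:
        print(text)
    except UnicodeEncodeError:
        encoding = sys.stdout.encoding or "ascii"
        # Fall back to backslash escapes so output never crashes, even if some
        # characters are rendered as \uXXXX sequences.
        print(text.encode(encoding, errors="backslashreplace").decode(encoding))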
1 change: 1 addition & 0 deletions pythainlp/cli/data.py
@@ -4,6 +4,7 @@
"""
Command line for PyThaiNLP's dataset/corpus management.
"""

import argparse

from pythainlp import corpus
5 changes: 4 additions & 1 deletion pythainlp/cli/soundex.py
@@ -6,9 +6,11 @@
It takes input text from the command line.
"""

import argparse

from pythainlp.soundex import DEFAULT_SOUNDEX_ENGINE, soundex
from pythainlp.tools import safe_print


class App:
@@ -47,4 +49,5 @@ def __init__(self, argv):
args = parser.parse_args(argv[2:])

sdx = soundex(args.text, engine=args.algorithm)
print(sdx)

safe_print(sdx)
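With the UTF-8 stream setup in pythainlp/cli/__init__.py, the soundex subcommand can print its result for Thai input without tripping over the console encoding; the updated test suite exercises exactly this call:

# Same invocation as in the new CLI tests; soundex codes for Thai input can
# themselves contain Thai characters, which previously could fail with
# UnicodeEncodeError on a Windows console using a legacy code page.
from pythainlp.cli.soundex import App as SoundexApp

SoundexApp(["thainlp", "soundex", "ทดสอบ"])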
4 changes: 3 additions & 1 deletion pythainlp/cli/tag.py
@@ -4,10 +4,12 @@
"""
Command line for PyThaiNLP's taggers.
"""

import argparse

from pythainlp import cli
from pythainlp.tag import pos_tag
from pythainlp.tools import safe_print


class SubAppBase:
@@ -34,7 +36,7 @@ def __init__(self, name, argv):
result = self.run(tokens)

for word, tag in result:
print(word, "/", tag)
safe_print(word + " / " + tag)


class POSTaggingApp(SubAppBase):
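The tagger's output line is unchanged by this edit: print(word, "/", tag) and the concatenated string passed to safe_print both render as "word / tag". A quick illustration with made-up values (and assuming safe_print takes a single, already-formatted string, which this diff does not show):

# Both call styles produce the same visible line; concatenation just builds the
# whole string up front, which suits a single-argument helper.
word, tag = "แมว", "NOUN"  # illustrative word/tag pair
print(word, "/", tag)      # แมว / NOUN
print(word + " / " + tag)  # แมว / NOUN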
5 changes: 3 additions & 2 deletions pythainlp/cli/tokenize.py
@@ -16,6 +16,7 @@
subword_tokenize,
word_tokenize,
)
from pythainlp.tools import safe_print

DEFAULT_SENT_TOKEN_SEPARATOR = "@@"
DEFAULT_SUBWORD_TOKEN_SEPARATOR = "/"
@@ -71,7 +72,7 @@ def __init__(self, name, argv):
engine=args.algorithm,
keep_whitespace=args.keep_whitespace,
)
print(args.separator.join(result) + args.separator)
safe_print(args.separator.join(result) + args.separator)


class WordTokenizationApp(SubAppBase):
@@ -144,4 +145,4 @@ def __init__(self, argv):
elif token_type.startswith("se"):
SentenceTokenizationApp("sent", argv)
else:
print(f"Token type not available: {token_type}")
safe_print(f"Token type not available: {token_type}")
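As the safe_print(args.separator.join(result) + args.separator) call shows, the tokenize subcommand joins the tokens with the chosen separator and appends one trailing separator. For example, with illustrative tokens:

# Illustrative tokens; the CLI prints them joined by the separator, with one
# trailing separator appended.
tokens = ["ผม", "กิน", "ข้าว"]
separator = "|"
print(separator.join(tokens) + separator)  # ผม|กิน|ข้าว|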
2 changes: 1 addition & 1 deletion tests/__init__.py
@@ -12,7 +12,7 @@
# Names of module to be tested
test_packages: list[str] = [
"tests.test_ancient",
# "tests.test_cli",
"tests.test_cli",
# "tests.test_corpus",
"tests.test_morpheme",
"tests.test_soundex",
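With "tests.test_cli" re-enabled in test_packages, the CLI tests run as part of the suite again. One way to run just that module locally is the usual unittest loader, sketched below (the project's own test runner may differ):

# Load and run only the re-enabled CLI test module; this mirrors what a
# standard unittest-based runner would do with the names in test_packages.
import unittest

suite = unittest.defaultTestLoader.loadTestsFromName("tests.test_cli")
unittest.TextTestRunner(verbosity=2).run(suite)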
95 changes: 27 additions & 68 deletions tests/test_cli.py
@@ -4,9 +4,12 @@

import unittest
from argparse import ArgumentError
from types import ModuleType

from pythainlp import __main__, cli
from pythainlp.cli.data import App as DataApp
from pythainlp.cli.soundex import App as SoundexApp
from pythainlp.cli.tag import App as TagApp
from pythainlp.cli.tokenize import App as TokenizeApp


class CliTestCase(unittest.TestCase):
@@ -26,68 +29,40 @@ def test_cli_main(self):

self.assertIsNone(__main__.main(["thainlp", "data", "path"]))

def test_cli_benchmark(self):
self.assertIsInstance(getattr(cli, "benchmark"), ModuleType)

with self.assertRaises(SystemExit) as ex:
cli.data.App(["thainlp", "benchmark"])
self.assertEqual(ex.exception.code, 2)

self.assertIsNotNone(
cli.benchmark.App(
[
"thainlp",
"benchmark",
"word-tokenization",
"--input-file",
"./tests/data/input.txt",
"--test-file",
"./tests/data/test.txt",
"--save-details",
]
)
)

def test_cli_data(self):
self.assertIsInstance(getattr(cli, "data"), ModuleType)
self.assertTrue(hasattr(cli, "data"))

with self.assertRaises(SystemExit) as ex:
cli.data.App(["thainlp", "data"])
DataApp(["thainlp", "data"])
self.assertEqual(ex.exception.code, 2)

self.assertIsNotNone(cli.data.App(["thainlp", "data", "catalog"]))
self.assertIsNotNone(cli.data.App(["thainlp", "data", "path"]))
self.assertIsNotNone(cli.data.App(["thainlp", "data", "get", "test"]))
self.assertIsNotNone(cli.data.App(["thainlp", "data", "info", "test"]))
self.assertIsNotNone(cli.data.App(["thainlp", "data", "rm", "test"]))
self.assertIsNotNone(
cli.data.App(["thainlp", "data", "get", "NOT_EXIST"])
)
self.assertIsNotNone(
cli.data.App(["thainlp", "data", "info", "NOT_EXIST"])
)
self.assertIsNotNone(
cli.data.App(["thainlp", "data", "rm", "NOT_EXIST"])
)
self.assertIsNotNone(DataApp(["thainlp", "data", "catalog"]))
self.assertIsNotNone(DataApp(["thainlp", "data", "path"]))
self.assertIsNotNone(DataApp(["thainlp", "data", "get", "test"]))
self.assertIsNotNone(DataApp(["thainlp", "data", "info", "test"]))
self.assertIsNotNone(DataApp(["thainlp", "data", "rm", "test"]))
self.assertIsNotNone(DataApp(["thainlp", "data", "get", "NOT_EXIST"]))
self.assertIsNotNone(DataApp(["thainlp", "data", "info", "NOT_EXIST"]))
self.assertIsNotNone(DataApp(["thainlp", "data", "rm", "NOT_EXIST"]))

def test_cli_soundex(self):
self.assertIsInstance(getattr(cli, "soundex"), ModuleType)
self.assertTrue(hasattr(cli, "soundex"))

with self.assertRaises(SystemExit) as ex:
cli.data.App(["thainlp", "soundex"])
DataApp(["thainlp", "soundex"])
self.assertEqual(ex.exception.code, 2)

self.assertIsNotNone(cli.soundex.App(["thainlp", "soundex", "ทดสอบ"]))
self.assertIsNotNone(SoundexApp(["thainlp", "soundex", "ทดสอบ"]))

def test_cli_tag(self):
self.assertIsInstance(getattr(cli, "tag"), ModuleType)
self.assertTrue(hasattr(cli, "tag"))

with self.assertRaises(SystemExit) as ex:
cli.data.App(["thainlp", "tag"])
DataApp(["thainlp", "tag"])
self.assertEqual(ex.exception.code, 2)

self.assertIsNotNone(
cli.tag.App(
TagApp(
[
"thainlp",
"tag",
@@ -99,7 +74,7 @@ def test_cli_tag(self):
)
)
self.assertIsNotNone(
cli.tag.App(
TagApp(
[
"thainlp",
"tag",
@@ -112,17 +87,17 @@
)

def test_cli_tokenize(self):
self.assertIsInstance(getattr(cli, "tokenize"), ModuleType)
self.assertTrue(hasattr(cli, "tokenize"))

with self.assertRaises(SystemExit) as ex:
cli.data.App(["thainlp", "tokenize"])
DataApp(["thainlp", "tokenize"])
self.assertEqual(ex.exception.code, 2)

self.assertIsNotNone(
cli.tokenize.App(["thainlp", "tokenize", "NOT_EXIST", "ไม่มีอยู่ จริง"])
TokenizeApp(["thainlp", "tokenize", "NOT_EXIST", "ไม่มีอยู่ จริง"])
)
self.assertIsNotNone(
cli.tokenize.App(
TokenizeApp(
[
"thainlp",
"tokenize",
@@ -134,7 +109,7 @@ def test_cli_tokenize(self):
)
)
self.assertIsNotNone(
cli.tokenize.App(
TokenizeApp(
[
"thainlp",
"tokenize",
@@ -147,7 +122,7 @@ def test_cli_tokenize(self):
)
)
self.assertIsNotNone(
cli.tokenize.App(
TokenizeApp(
[
"thainlp",
"tokenize",
@@ -161,19 +136,3 @@ def test_cli_tokenize(self):
]
)
)
self.assertIsNotNone(
cli.tokenize.App(
[
"thainlp",
"tokenize",
"sent",
"-s",
"|",
(
"ถ้าฉันยิงกระต่ายได้ ฉันก็ยิงฟาสซิสต์ได้"
"กระสุนสำหรับสมองของคุณวันนี้"
"แต่คุณก็จะลืมมันไปทั้งหมดอีกครั้ง"
),
]
)
)
52 changes: 52 additions & 0 deletions tests/testx_cli.py
@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0

import unittest

from pythainlp import __main__, cli
from pythainlp.cli.benchmark import App as BenchmarkApp
from pythainlp.cli.data import App as DataApp
from pythainlp.cli.tokenize import App as TokenizeApp


class CliTestCaseX(unittest.TestCase):
def test_cli_benchmark(self):
self.assertTrue(hasattr(cli, "benchmark"))

with self.assertRaises(SystemExit) as ex:
DataApp(["thainlp", "benchmark"])
self.assertEqual(ex.exception.code, 2)

self.assertIsNotNone(
BenchmarkApp(
[
"thainlp",
"benchmark",
"word-tokenization",
"--input-file",
"./tests/data/input.txt",
"--test-file",
"./tests/data/test.txt",
"--save-details",
]
)
)

def test_cli_tokenize(self):
self.assertIsNotNone(
TokenizeApp(
[
"thainlp",
"tokenize",
"sent",
"-s",
"|",
(
"ถ้าฉันยิงกระต่ายได้ ฉันก็ยิงฟาสซิสต์ได้"
"กระสุนสำหรับสมองของคุณวันนี้"
"แต่คุณก็จะลืมมันไปทั้งหมดอีกครั้ง"
),
]
)
)
