diff --git a/docs/api/classify.rst b/docs/api/classify.rst
new file mode 100644
index 000000000..f6ad6897b
--- /dev/null
+++ b/docs/api/classify.rst
@@ -0,0 +1,7 @@
+.. currentmodule:: pythainlp.classify
+
+pythainlp.classify
+==================
+
+.. autoclass:: GzipModel
+   :members:
diff --git a/docs/api/cls.rst b/docs/api/cls.rst
deleted file mode 100644
index 3a3f13426..000000000
--- a/docs/api/cls.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-.. currentmodule:: pythainlp.cls
-
-pythainlp.cls
-=============
-
-.. autoclass:: GzipModel
-   :members:
diff --git a/notebooks/clean-dict.ipynb b/notebooks/clean_dict.ipynb
similarity index 100%
rename from notebooks/clean-dict.ipynb
rename to notebooks/clean_dict.ipynb
diff --git a/notebooks/test-aksonhan.ipynb b/notebooks/test_aksonhan.ipynb
similarity index 100%
rename from notebooks/test-aksonhan.ipynb
rename to notebooks/test_aksonhan.ipynb
diff --git a/notebooks/test-chat.ipynb b/notebooks/test_chat.ipynb
similarity index 100%
rename from notebooks/test-chat.ipynb
rename to notebooks/test_chat.ipynb
diff --git a/notebooks/test_gzip_cls.ipynb b/notebooks/test_gzip_classify.ipynb
similarity index 95%
rename from notebooks/test_gzip_cls.ipynb
rename to notebooks/test_gzip_classify.ipynb
index 86666b355..573e7c059 100644
--- a/notebooks/test_gzip_cls.ipynb
+++ b/notebooks/test_gzip_classify.ipynb
@@ -7,7 +7,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import pythainlp.cls.param_free"
+    "import pythainlp.classify.param_free"
    ]
   },
   {
@@ -37,7 +37,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "model = pythainlp.cls.param_free.GzipModel(training_data)"
+    "model = pythainlp.classify.param_free.GzipModel(training_data)"
    ]
   },
   {
diff --git a/notebooks/test-wangchanglm.ipynb b/notebooks/test_wangchanglm.ipynb
similarity index 100%
rename from notebooks/test-wangchanglm.ipynb
rename to notebooks/test_wangchanglm.ipynb
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 000000000..737a5baf0
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,10 @@
+[tool.ruff]
+line-length = 79
+indent-width = 4
+target-version = "py38"
+
+[tool.ruff.format]
+quote-style = "double"
+indent-style = "space"
+skip-magic-trailing-comma = false
+line-ending = "auto"
diff --git a/pythainlp/chat/core.py b/pythainlp/chat/core.py
index 4026b3c23..43ae6c00a 100644
--- a/pythainlp/chat/core.py
+++ b/pythainlp/chat/core.py
@@ -10,24 +10,26 @@ def __init__(self):
         Chat using AI generation
         """
         self.history = []
+
     def reset_chat(self):
         """
         Reset chat by cleaning history
         """
         self.history = []
+
     def load_model(
         self,
-        model_name:str="wangchanglm",
-        return_dict:bool=True,
-        load_in_8bit:bool=False,
-        device:str="cuda",
+        model_name: str = "wangchanglm",
+        return_dict: bool = True,
+        load_in_8bit: bool = False,
+        device: str = "cuda",
         torch_dtype=torch.float16,
-        offload_folder:str="./",
-        low_cpu_mem_usage:bool=True
+        offload_folder: str = "./",
+        low_cpu_mem_usage: bool = True,
     ):
         """
         Load model
-
+
         :param str model_name: Model name (Now, we support wangchanglm only)
         :param bool return_dict: return_dict
         :param bool load_in_8bit: load model in 8bit
@@ -38,6 +40,7 @@ def load_model(
         """
         if model_name == "wangchanglm":
             from pythainlp.generate.wangchanglm import WangChanGLM
+
             self.model = WangChanGLM()
             self.model.load_model(
                 model_path="pythainlp/wangchanglm-7.5B-sft-en-sharded",
@@ -46,14 +49,15 @@ def load_model(
                 offload_folder=offload_folder,
                 device=device,
                 torch_dtype=torch_dtype,
-                low_cpu_mem_usage=low_cpu_mem_usage
+                low_cpu_mem_usage=low_cpu_mem_usage,
             )
         else:
             raise NotImplementedError(f"We doesn't support {model_name}.")
-    def chat(self, text:str)->str:
+
+    def chat(self, text: str) -> str:
         """
         Chatbot
-
+
         :param str text: text for asking chatbot with.
         :return: answer from chatbot.
         :rtype: str
@@ -72,11 +76,18 @@ def chat(self, text:str)->str:
             print(chatbot.history)
             # output: [('สวัสดี', 'ยินดีที่ได้รู้จัก')]
         """
-        _temp=""
+        _temp = ""
         if self.history:
-            for h,b in self.history:
-                _temp+=self.model.PROMPT_DICT['prompt_chatbot'].format_map({"human":h,"bot":b})+self.model.stop_token
-        _temp+=self.model.PROMPT_DICT['prompt_chatbot'].format_map({"human":text,"bot":""})
+            for h, b in self.history:
+                _temp += (
+                    self.model.PROMPT_DICT["prompt_chatbot"].format_map(
+                        {"human": h, "bot": b}
+                    )
+                    + self.model.stop_token
+                )
+        _temp += self.model.PROMPT_DICT["prompt_chatbot"].format_map(
+            {"human": text, "bot": ""}
+        )
         _bot = self.model.gen_instruct(_temp)
-        self.history.append((text,_bot))
+        self.history.append((text, _bot))
         return _bot
diff --git a/pythainlp/classify/__init__.py b/pythainlp/classify/__init__.py
new file mode 100644
index 000000000..0f36e8b6e
--- /dev/null
+++ b/pythainlp/classify/__init__.py
@@ -0,0 +1,10 @@
+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
+# SPDX-License-Identifier: Apache-2.0
+"""
+pythainlp.classify
+"""
+
+__all__ = ["GzipModel"]
+
+from pythainlp.classify.param_free import GzipModel
diff --git a/pythainlp/cls/param_free.py b/pythainlp/classify/param_free.py
similarity index 92%
rename from pythainlp/cls/param_free.py
rename to pythainlp/classify/param_free.py
index 961d21087..1cfd3859d 100644
--- a/pythainlp/cls/param_free.py
+++ b/pythainlp/classify/param_free.py
@@ -9,7 +9,9 @@

 class GzipModel:
     """
-    This class is a re-implementation of “Low-Resource” Text Classification: A Parameter-Free Classification Method with Compressors (Jiang et al., Findings 2023)
+    This class is a re-implementation of
+    “Low-Resource” Text Classification: A Parameter-Free Classification Method with Compressors
+    (Jiang et al., Findings 2023)

     :param list training_data: list [(text_sample,label)]
     """
@@ -36,7 +38,7 @@ def predict(self, x1: str, k: int = 1) -> str:
         :Example:
         ::

-            from pythainlp.cls import GzipModel
+            from pythainlp.classify import GzipModel

             training_data = [
                 ("รายละเอียดตามนี้เลยค่าา ^^", "Neutral"),
@@ -63,8 +65,10 @@ def predict(self, x1: str, k: int = 1) -> str:
             # normalized compression distance
             ncd = (Cx1x2 - min(Cx1, Cx2)) / max(Cx1, Cx2)
             disance_from_x1.append(ncd)
+
         sorted_idx = np.argsort(np.array(disance_from_x1))
         top_k_class = self.training_data[sorted_idx[:k], 1]
         _, counts = np.unique(top_k_class, return_counts=True)
         predict_class = top_k_class[counts.argmax()]
+
         return predict_class
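Reviewer note: only the import path changes in this rename; the classifier logic is untouched. A minimal usage sketch of the renamed module, based on the docstring and test in this PR, plus a standalone illustration of the normalized compression distance (NCD) visible in the hunk above. The `ncd` helper below is illustrative only, not part of the API, and the exact way `GzipModel` concatenates the two texts internally may differ from the space-join assumed here:

```python
import gzip

from pythainlp.classify import GzipModel  # was: from pythainlp.cls import GzipModel

training_data = [
    ("บริการแย่มากก เป็นหมอได้ไง😤", "Negative"),
    ("ขับรถแย่มาก", "Negative"),
    ("ดีนะครับ", "Positive"),
    ("ลองแล้วรสนี้อร่อย... ชอบๆ", "Positive"),
    ("นี่เป็นบทความหนึ่ง", "Neutral"),
]
model = GzipModel(training_data)
# k-nearest-neighbor vote over compression distances; with this small
# sample the result should be a label from the training set, e.g. "Positive"
print(model.predict("ฉันดีใจ", k=1))


def ncd(x1: str, x2: str) -> float:
    """Normalized compression distance, as in the hunk above (illustrative)."""
    c_x1 = len(gzip.compress(x1.encode()))
    c_x2 = len(gzip.compress(x2.encode()))
    c_x1x2 = len(gzip.compress(" ".join([x1, x2]).encode()))
    return (c_x1x2 - min(c_x1, c_x2)) / max(c_x1, c_x2)


# Similar texts typically compress better together, giving a smaller NCD:
print(ncd("ขับรถแย่มาก", "ขับรถแย่มาก"))  # near 0 for identical strings
print(ncd("ขับรถแย่มาก", "นี่เป็นบทความหนึ่ง"))  # larger for unrelated strings
```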
""" +import warnings __all__ = ["GzipModel"] -from pythainlp.cls.param_free import GzipModel +from pythainlp.classify.param_free import GzipModel + +warnings.warn( + "Deprecated: Use pythainlp.classify instead.", DeprecationWarning +) diff --git a/pythainlp/coref/_fastcoref.py b/pythainlp/coref/_fastcoref.py index 5529efe3e..4128ed780 100644 --- a/pythainlp/coref/_fastcoref.py +++ b/pythainlp/coref/_fastcoref.py @@ -6,7 +6,13 @@ class FastCoref: - def __init__(self, model_name, nlp=spacy.blank("th"), device: str="cpu", type: str="FCoref") -> None: + def __init__( + self, + model_name, + nlp=spacy.blank("th"), + device: str = "cpu", + type: str = "FCoref", + ) -> None: if type == "FCoref": from fastcoref import FCoref as _model else: @@ -17,11 +23,12 @@ def __init__(self, model_name, nlp=spacy.blank("th"), device: str="cpu", type: s def _to_json(self, _predict): return { - "text":_predict.text, - "clusters_string":_predict.get_clusters(as_strings=True), - "clusters":_predict.get_clusters(as_strings=False) + "text": _predict.text, + "clusters_string": _predict.get_clusters(as_strings=True), + "clusters": _predict.get_clusters(as_strings=False), } - - def predict(self, texts: List[str]) -> dict: - return [self._to_json(i) for i in self.model.predict(texts=texts)] + def predict(self, texts: List[str]) -> List[dict]: + return [ + self._to_json(pred) for pred in self.model.predict(texts=texts) + ] diff --git a/pythainlp/coref/core.py b/pythainlp/coref/core.py index 386729e60..c498593b9 100644 --- a/pythainlp/coref/core.py +++ b/pythainlp/coref/core.py @@ -2,21 +2,26 @@ # SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project # SPDX-License-Identifier: Apache-2.0 from typing import List + model = None -def coreference_resolution(texts:List[str], model_name:str="han-coref-v1.0", device:str="cpu"): +def coreference_resolution( + texts: List[str], model_name: str = "han-coref-v1.0", device: str = "cpu" +): """ Coreference Resolution :param List[str] texts: list of texts to apply coreference resolution to :param str model_name: coreference resolution model - :param str device: device for running coreference resolution model on (cpu, cuda, and others) + :param str device: device for running coreference resolution model on\ + ("cpu", "cuda", and others) :return: List of texts with coreference resolution :rtype: List[dict] :Options for model_name: - * *han-coref-v1.0* - (default) Han-Corf: Thai oreference resolution by PyThaiNLP v1.0 + * *han-coref-v1.0* - (default) Han-Coref: Thai coreference resolution\ + by PyThaiNLP v1.0 :Example: :: @@ -30,15 +35,23 @@ def coreference_resolution(texts:List[str], model_name:str="han-coref-v1.0", dev ) # output: # [ - # {'text': 'Bill Gates ได้รับวัคซีน COVID-19 เข็มแรกแล้ว ระบุ ผมรู้สึกสบายมาก', - # 'clusters_string': [['Bill Gates', 'ผม']], + # {'text': 'Bill Gates ได้รับวัคซีน COVID-19 เข็มแรกแล้ว ระบุ ผมรู้สึกสบายมาก', + # 'clusters_string': [['Bill Gates', 'ผม']], # 'clusters': [[(0, 10), (50, 52)]]} # ] """ global model if isinstance(texts, str): texts = [texts] - if model is None and model_name=="han-coref-v1.0": + + if model is None and model_name == "han-coref-v1.0": from pythainlp.coref.han_coref import HanCoref + model = HanCoref(device=device) - return model.predict(texts) + + if model: + return model.predict(texts) + + return [ + {"text": text, "clusters_string": [], "clusters": []} for text in texts + ] diff --git a/pythainlp/coref/han_coref.py b/pythainlp/coref/han_coref.py index d8b7460d5..60e068d09 100644 --- a/pythainlp/coref/han_coref.py +++ 
b/pythainlp/coref/han_coref.py
@@ -6,9 +6,7 @@


 class HanCoref(FastCoref):
-    def __init__(self,device:str="cpu",nlp=spacy.blank("th")) -> None:
+    def __init__(self, device: str = "cpu", nlp=spacy.blank("th")) -> None:
         super().__init__(
-            model_name="pythainlp/han-coref-v1.0",
-            device=device,
-            nlp=nlp
+            model_name="pythainlp/han-coref-v1.0", device=device, nlp=nlp
         )
diff --git a/pythainlp/tag/__init__.py b/pythainlp/tag/__init__.py
index aff1c374e..602e22b48 100644
--- a/pythainlp/tag/__init__.py
+++ b/pythainlp/tag/__init__.py
@@ -10,17 +10,17 @@

 __all__ = [
     "PerceptronTagger",
+    "NER",
+    "NNER",
+    "chunk_parse",
     "pos_tag",
     "pos_tag_sents",
+    "pos_tag_transformers",
     "tag_provinces",
-    "chunk_parse",
-    "NER",
-    "NNER",
-    "pos_tag_transformers"
 ]

-from pythainlp.tag.locations import tag_provinces
-from pythainlp.tag.pos_tag import pos_tag, pos_tag_sents, pos_tag_transformers
 from pythainlp.tag._tag_perceptron import PerceptronTagger
 from pythainlp.tag.chunk import chunk_parse
+from pythainlp.tag.locations import tag_provinces
 from pythainlp.tag.named_entity import NER, NNER
+from pythainlp.tag.pos_tag import pos_tag, pos_tag_sents, pos_tag_transformers
diff --git a/tests/test_classify.py b/tests/test_classify.py
new file mode 100644
index 000000000..c45049f7a
--- /dev/null
+++ b/tests/test_classify.py
@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+import unittest
+from pythainlp.classify import GzipModel
+
+
+class TestClsPackage(unittest.TestCase):
+    def test_GzipModel(self):
+        training_data = [
+            ("รายละเอียดตามนี้เลยค่าา ^^", "Neutral"),
+            ("กลัวพวกมึงหาย อดกินบาบิก้อน", "Neutral"),
+            ("บริการแย่มากก เป็นหมอได้ไง😤", "Negative"),
+            ("ขับรถแย่มาก", "Negative"),
+            ("ดีนะครับ", "Positive"),
+            ("ลองแล้วรสนี้อร่อย... ชอบๆ", "Positive"),
+            ("ฉันรู้สึกโกรธ เวลามือถือแบตหมด", "Negative"),
+            ("เธอภูมิใจที่ได้ทำสิ่งดี ๆ และดีใจกับเด็ก ๆ", "Positive"),
+            ("นี่เป็นบทความหนึ่ง", "Neutral"),
+        ]
+        model = GzipModel(training_data)
+        self.assertEqual(model.predict("ฉันดีใจ", k=1), "Positive")
diff --git a/tests/test_cls.py b/tests/test_cls.py
deleted file mode 100644
index 984b5f4a4..000000000
--- a/tests/test_cls.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# -*- coding: utf-8 -*-
-import unittest
-from pythainlp.cls import GzipModel
-
-
-class TestClsPackage(unittest.TestCase):
-    def test_GzipModel(self):
-        training_data = [
-            ("รายละเอียดตามนี้เลยค่าา ^^", "Neutral"),
-            ("กลัวพวกมึงหาย อดกินบาบิก้อน", "Neutral"),
-            ("บริการแย่มากก เป็นหมอได้ไง😤", "Negative"),
-            ("ขับรถแย่มาก", "Negative"),
-            ("ดีนะครับ", "Positive"),
-            ("ลองแล้วรสนี้อร่อย... ชอบๆ", "Positive"),
-            ("ฉันรู้สึกโกรธ เวลามือถือแบตหมด", "Negative"),
-            ("เธอภูมิใจที่ได้ทำสิ่งดี ๆ และดีใจกับเด็ก ๆ", "Positive"),
-            ("นี่เป็นบทความหนึ่ง", "Neutral")
-        ]
-        model = GzipModel(training_data)
-        self.assertEqual(model.predict("ฉันดีใจ", k=1), "Positive")
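Reviewer note: the `pythainlp.cls` shim keeps old imports working while emitting a module-level `DeprecationWarning`. A quick check of the migration path; `DeprecationWarning` is filtered out by default in many contexts, so the filter is set before the import, and the warning fires only on the first import of `pythainlp.cls` in a process:

```python
import warnings

warnings.simplefilter("always", DeprecationWarning)

# Old path still works, but warns: "Deprecated: Use pythainlp.classify instead."
from pythainlp.cls import GzipModel as OldGzipModel
from pythainlp.classify import GzipModel

# The shim re-exports the same class from pythainlp.classify.param_free,
# so the old and new import paths remain interchangeable:
assert OldGzipModel is GzipModel
```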