
Commit

style: format all files according to black
new5558 committed Oct 25, 2022
1 parent c7358da commit e07673f
Showing 81 changed files with 1,113 additions and 994 deletions.
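The hunks below are mechanical output from Black (the added bin/act binary aside); no runtime behaviour changes. Most of the churn comes from two defaults: string quotes are normalized to double quotes, and lines are reflowed against the 88-character limit. A minimal sketch of reproducing one such change through Black's Python API follows; the exact Black version and configuration used for this commit are not recorded on this page, so the default mode below is an assumption:

```python
import black

# A line touched in pythainlp/augment/lm/fasttext.py, rewritten as an
# assignment (instead of a return) so it parses at module level.
src = "words = word_tokenize(text, engine='icu')\n"

# FileMode() is Black's default configuration: 88-character lines and
# normalization of string quotes to double quotes.
print(black.format_str(src, mode=black.FileMode()))
# -> words = word_tokenize(text, engine="icu")
```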
Binary file added bin/act (binary content not shown)
11 changes: 5 additions & 6 deletions pythainlp/augment/lm/fasttext.py
@@ -12,13 +12,14 @@ class FastTextAug:
:param str model_path: path of model file
"""

def __init__(self, model_path: str):
"""
:param str model_path: path of model file
"""
-if model_path.endswith('.bin'):
+if model_path.endswith(".bin"):
self.model = FastText_gensim.load_facebook_vectors(model_path)
-elif model_path.endswith('.vec'):
+elif model_path.endswith(".vec"):
self.model = KeyedVectors.load_word2vec_format(model_path)
else:
self.model = FastText_gensim.load(model_path)
@@ -33,7 +34,7 @@ def tokenize(self, text: str) -> List[str]:
:return: list of word
:rtype: List[str]
"""
-return word_tokenize(text, engine='icu')
+return word_tokenize(text, engine="icu")

def modify_sent(self, sent: str, p: float = 0.7) -> List[List[str]]:
"""
@@ -44,9 +45,7 @@ def modify_sent(self, sent: str, p: float = 0.7) -> List[List[str]]:
list_sent_new = []
for i in sent:
if i in self.dict_wv:
-w = [
-j for j, v in self.model.most_similar(i) if v >= p
-]
+w = [j for j, v in self.model.most_similar(i) if v >= p]
if w == []:
list_sent_new.append([i])
else:
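The fasttext.py hunks above are typical of the whole commit: quotes are normalized and constructs that fit within 88 characters are joined onto one line, such as the three-line list comprehension collapsed above. As a rough sketch of what that comprehension computes, using gensim's KeyedVectors.most_similar; the vectors file and query word here are hypothetical:

```python
from gensim.models import KeyedVectors

# Hypothetical word2vec-format vectors; any such file behaves the same way.
model = KeyedVectors.load_word2vec_format("vectors.vec")

p = 0.7           # similarity threshold, as in modify_sent()
word = "example"  # placeholder query word

# most_similar() yields (neighbour, cosine similarity) pairs; the collapsed
# comprehension keeps only neighbours whose similarity is at least p.
candidates = [j for j, v in model.most_similar(word) if v >= p]
print(candidates or [word])  # modify_sent() falls back to the original word
```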
28 changes: 13 additions & 15 deletions pythainlp/augment/lm/wangchanberta.py
@@ -14,26 +14,26 @@ def __init__(self):
self.model_name = "airesearch/wangchanberta-base-att-spm-uncased"
self.target_tokenizer = CamembertTokenizer
self.tokenizer = CamembertTokenizer.from_pretrained(
-self.model_name,
-revision='main')
+self.model_name, revision="main"
+)
self.tokenizer.additional_special_tokens = [
-'<s>NOTUSED',
-'</s>NOTUSED',
-'<_>'
+"<s>NOTUSED",
+"</s>NOTUSED",
+"<_>",
]
self.fill_mask = pipeline(
-task='fill-mask',
+task="fill-mask",
tokenizer=self.tokenizer,
-model=f'{self.model_name}',
-revision='main'
+model=f"{self.model_name}",
+revision="main",
)
self.MASK_TOKEN = self.tokenizer.mask_token

def generate(self, sentence: str, num_replace_tokens: int = 3):
self.sent2 = []
self.input_text = sentence
sent = [
-i for i in self.tokenizer.tokenize(self.input_text) if i != '▁'
+i for i in self.tokenizer.tokenize(self.input_text) if i != "▁"
]
if len(sent) < num_replace_tokens:
num_replace_tokens = len(sent)
@@ -42,18 +42,16 @@ def generate(self, sentence: str, num_replace_tokens: int = 3):
replace_token = [
sent.pop(random.randrange(len(sent))) for _ in range(1)
][0]
-masked_text = masked_text+self.MASK_TOKEN
+masked_text = masked_text + self.MASK_TOKEN
self.sent2 += [
-str(j['sequence']).replace('<s> ', '').replace('</s>', '')
+str(j["sequence"]).replace("<s> ", "").replace("</s>", "")
for j in self.fill_mask(masked_text)
-if j['sequence'] not in self.sent2
+if j["sequence"] not in self.sent2
]
masked_text = self.input_text
return self.sent2

-def augment(
-self, sentence: str, num_replace_tokens: int = 3
-) -> List[str]:
+def augment(self, sentence: str, num_replace_tokens: int = 3) -> List[str]:
"""
Text Augment from wangchanberta
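For context on the reformatted generate() method: a Hugging Face fill-mask pipeline returns a list of candidate dicts, and the "sequence" field holds the full sentence with the mask filled in, which is what generate() cleans up and collects. A rough sketch of that call, mirroring the constructor arguments shown in the diff; the model downloads on first use and the example sentence is made up:

```python
from transformers import CamembertTokenizer, pipeline

model_name = "airesearch/wangchanberta-base-att-spm-uncased"
tokenizer = CamembertTokenizer.from_pretrained(model_name, revision="main")
fill_mask = pipeline(
    task="fill-mask", model=model_name, tokenizer=tokenizer, revision="main"
)

# Each candidate dict carries "sequence", "score", "token" and "token_str";
# generate() keeps the cleaned-up "sequence" strings it has not seen before.
for candidate in fill_mask(f"วันนี้อากาศ{tokenizer.mask_token}มาก"):
    print(candidate["score"], candidate["sequence"])
```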
6 changes: 1 addition & 5 deletions pythainlp/augment/word2vec/__init__.py
@@ -3,11 +3,7 @@
Word2Vec
"""

-__all__ = [
-"Word2VecAug",
-"Thai2fitAug",
-"LTW2VAug"
-]
+__all__ = ["Word2VecAug", "Thai2fitAug", "LTW2VAug"]

from pythainlp.augment.word2vec.core import Word2VecAug
from pythainlp.augment.word2vec.thai2fit import Thai2fitAug
4 changes: 3 additions & 1 deletion pythainlp/augment/word2vec/bpemb_wv.py
@@ -10,8 +10,10 @@ class BPEmbAug:
BPEmb:
`github.com/bheinzerling/bpemb <https://github.com/bheinzerling/bpemb>`_
"""

def __init__(self, lang: str = "th", vs: int = 100000, dim: int = 300):
from bpemb import BPEmb

self.bpemb_temp = BPEmb(lang=lang, dim=dim, vs=vs)
self.model = self.bpemb_temp.emb
self.load_w2v()
@@ -58,6 +60,6 @@ def augment(
for i in self.temp:
self.t = ""
for j in i:
-self.t += j.replace('▁', '')
+self.t += j.replace("▁", "")
self.temp_new.append(self.t)
return self.temp_new
12 changes: 4 additions & 8 deletions pythainlp/augment/word2vec/core.py
@@ -13,12 +13,13 @@ def __init__(
:param str type: moodel type (file, binary)
"""
import gensim.models.keyedvectors as word2vec

self.tokenizer = tokenize
if type == "file":
self.model = word2vec.KeyedVectors.load_word2vec_format(model)
elif type == "binary":
self.model = word2vec.KeyedVectors.load_word2vec_format(
-model, binary=True, unicode_errors='ignore'
+model, binary=True, unicode_errors="ignore"
)
else:
self.model = model
@@ -33,9 +34,7 @@ def modify_sent(self, sent: str, p: float = 0.7) -> List[List[str]]:
list_sent_new = []
for i in sent:
if i in self.dict_wv:
-w = [
-j for j, v in self.model.most_similar(i) if v >= p
-]
+w = [j for j, v in self.model.most_similar(i) if v >= p]
if w == []:
list_sent_new.append([i])
else:
@@ -45,10 +44,7 @@ def modify_sent(self, sent: str, p: float = 0.7) -> List[List[str]]:
return list_sent_new

def augment(
-self,
-sentence: str,
-n_sent: int = 1,
-p: float = 0.7
+self, sentence: str, n_sent: int = 1, p: float = 0.7
) -> List[Tuple[str]]:
"""
:param str sentence: text sentence
10 changes: 4 additions & 6 deletions pythainlp/augment/word2vec/ltw2v.py
@@ -12,16 +12,17 @@ class LTW2VAug:
LTW2V:
`github.com/PyThaiNLP/large-thaiword2vec <https://github.com/PyThaiNLP/large-thaiword2vec>`_
"""

def __init__(self):
-self.ltw2v_wv = get_corpus_path('ltw2v')
+self.ltw2v_wv = get_corpus_path("ltw2v")
self.load_w2v()

def tokenizer(self, text: str) -> List[str]:
"""
:param str text: thai text
:rtype: List[str]
"""
-return word_tokenize(text, engine='newmm')
+return word_tokenize(text, engine="newmm")

def load_w2v(self): # insert substitute
"""
@@ -30,10 +31,7 @@ def load_w2v(self): # insert substitute
self.aug = Word2VecAug(self.ltw2v_wv, self.tokenizer, type="binary")

def augment(
-self,
-sentence: str,
-n_sent: int = 1,
-p: float = 0.7
+self, sentence: str, n_sent: int = 1, p: float = 0.7
) -> List[Tuple[str]]:
"""
Text Augment using word2vec from Thai2Fit
8 changes: 3 additions & 5 deletions pythainlp/augment/word2vec/thai2fit.py
@@ -12,8 +12,9 @@ class Thai2fitAug:
Thai2Fit:
`github.com/cstorm125/thai2fit <https://github.com/cstorm125/thai2fit>`_
"""

def __init__(self):
-self.thai2fit_wv = get_corpus_path('thai2fit_wv')
+self.thai2fit_wv = get_corpus_path("thai2fit_wv")
self.load_w2v()

def tokenizer(self, text: str) -> List[str]:
@@ -30,10 +31,7 @@ def load_w2v(self):
self.aug = Word2VecAug(self.thai2fit_wv, self.tokenizer, type="binary")

def augment(
-self,
-sentence: str,
-n_sent: int = 1,
-p: float = 0.7
+self, sentence: str, n_sent: int = 1, p: float = 0.7
) -> List[Tuple[str]]:
"""
Text Augment using word2vec from Thai2Fit
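The augment() signatures reflowed in ltw2v.py and thai2fit.py keep the same parameters as before: n_sent augmented sentences and a minimum cosine similarity p for substituting a word with one of its neighbours. A small usage sketch of the wrapper based on those parameters; the word-vector corpus downloads on first use and the input sentence is only illustrative:

```python
from pythainlp.augment.word2vec import Thai2fitAug

aug = Thai2fitAug()  # loads the thai2fit_wv vectors via get_corpus_path()

# Two augmented variants; words are replaced only when a neighbour reaches
# cosine similarity 0.7 or better.
print(aug.augment("ผมไปเที่ยวทะเล", n_sent=2, p=0.7))
```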
14 changes: 6 additions & 8 deletions pythainlp/augment/wordnet.py
@@ -107,7 +107,7 @@ def postype2wordnet(pos: str, corpus: str):
**Options for corpus**
* *orchid* - Orchid Corpus
"""
-if corpus not in ['orchid']:
+if corpus not in ["orchid"]:
return None
return orchid[pos]

@@ -116,14 +116,12 @@ class WordNetAug:
"""
Text Augment using wordnet
"""

def __init__(self):
pass

def find_synonyms(
-self,
-word: str,
-pos: str = None,
-postag_corpus: str = "orchid"
+self, word: str, pos: str = None, postag_corpus: str = "orchid"
) -> List[str]:
"""
Find synonyms from wordnet
@@ -139,13 +137,13 @@ def find_synonyms(
self.list_synsets = wordnet.synsets(word)
else:
self.p2w_pos = postype2wordnet(pos, postag_corpus)
-if self.p2w_pos != '':
+if self.p2w_pos != "":
self.list_synsets = wordnet.synsets(word, pos=self.p2w_pos)
else:
self.list_synsets = wordnet.synsets(word)

for self.synset in wordnet.synsets(word):
-for self.syn in self.synset.lemma_names(lang='tha'):
+for self.syn in self.synset.lemma_names(lang="tha"):
self.synonyms.append(self.syn)

self.synonyms_without_duplicates = list(
@@ -159,7 +157,7 @@ def augment(
tokenize: object = word_tokenize,
max_syn_sent: int = 6,
postag: bool = True,
postag_corpus: str = "orchid"
postag_corpus: str = "orchid",
) -> List[List[str]]:
"""
Text Augment using wordnet
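The wordnet.py hunks likewise only reflow signatures and normalize quotes; the lookup logic is untouched. A short sketch of the lookup that find_synonyms() performs, assuming the NLTK WordNet and Open Multilingual WordNet data are installed; the query word is a hypothetical example:

```python
from pythainlp.augment.wordnet import WordNetAug

aug = WordNetAug()

# Synonyms are gathered from the Thai lemma names (lang="tha") of every
# synset of the word, then de-duplicated.
print(aug.find_synonyms("รถไฟ"))
```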
2 changes: 1 addition & 1 deletion pythainlp/benchmarks/word_tokenization.py
@@ -199,7 +199,7 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict:
"word_level": {
"correctly_tokenised_words": correctly_tokenised_words,
"total_words_in_sample": np.sum(sample),
"total_words_in_ref_sample": np.sum(ref_sample)
"total_words_in_ref_sample": np.sum(ref_sample),
},
"global": {
"tokenisation_indicators": "".join(tokenization_indicators)
12 changes: 9 additions & 3 deletions pythainlp/cli/data.py
@@ -46,7 +46,9 @@ def get(self, argv):
usage="thainlp data get <dataset_name>",
)
parser.add_argument(
"dataset_name", type=str, help="dataset/corpus's name",
"dataset_name",
type=str,
help="dataset/corpus's name",
)
args = parser.parse_args(argv[3:])
if corpus.download(args.dataset_name):
@@ -60,7 +62,9 @@ def rm(self, argv):
usage="thainlp data rm <dataset_name>",
)
parser.add_argument(
"dataset_name", type=str, help="dataset/corpus's name",
"dataset_name",
type=str,
help="dataset/corpus's name",
)
args = parser.parse_args(argv[3:])
if corpus.remove(args.dataset_name):
@@ -74,7 +78,9 @@ def info(self, argv):
usage="thainlp data info <dataset_name>",
)
parser.add_argument(
"dataset_name", type=str, help="dataset/corpus's name",
"dataset_name",
type=str,
help="dataset/corpus's name",
)
args = parser.parse_args(argv[3:])
info = corpus.get_corpus_db_detail(args.dataset_name)
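The add_argument() calls in this and the following CLI modules all grow from one line to several for the same reason: the original call already ends with a trailing comma, and Black's magic trailing comma rule keeps such calls expanded, one argument per line. A sketch of that behaviour, again assuming Black's Python API with the default mode:

```python
import black

no_comma = 'parser.add_argument("dataset_name", type=str, help="name")\n'
trailing = 'parser.add_argument("dataset_name", type=str, help="name",)\n'

# Without a trailing comma, the short call is kept on one line...
print(black.format_str(no_comma, mode=black.FileMode()))
# ...with one, Black explodes the call to one argument per line.
print(black.format_str(trailing, mode=black.FileMode()))
```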
4 changes: 3 additions & 1 deletion pythainlp/cli/soundex.py
@@ -37,7 +37,9 @@ def __init__(self, argv):
default=DEFAULT_SOUNDEX_ENGINE,
)
parser.add_argument(
"text", type=str, help="input text",
"text",
type=str,
help="input text",
)

args = parser.parse_args(argv[2:])
4 changes: 3 additions & 1 deletion pythainlp/cli/tag.py
@@ -11,7 +11,9 @@ class SubAppBase:
def __init__(self, name, argv):
parser = argparse.ArgumentParser(**cli.make_usage("tag " + name))
parser.add_argument(
"text", type=str, help="input text",
"text",
type=str,
help="input text",
)
parser.add_argument(
"-s",
9 changes: 7 additions & 2 deletions pythainlp/cli/tokenize.py
@@ -25,7 +25,10 @@ class SubAppBase:
def __init__(self, name, argv):
parser = argparse.ArgumentParser(**cli.make_usage("tokenize " + name))
parser.add_argument(
"text", type=str, nargs="?", help="input text",
"text",
type=str,
nargs="?",
help="input text",
)
parser.add_argument(
"-s",
@@ -122,7 +125,9 @@ def __init__(self, argv):
),
)
parser.add_argument(
"token_type", type=str, help="[subword|word|sent]",
"token_type",
type=str,
help="[subword|word|sent]",
)

args = parser.parse_args(argv[2:3])
4 changes: 1 addition & 3 deletions pythainlp/corpus/__init__.py
@@ -41,9 +41,7 @@
_CHECK_MODE = os.getenv("PYTHAINLP_READ_MODE")

# remote corpus catalog URL
-_CORPUS_DB_URL = (
-"https://pythainlp.github.io/pythainlp-corpus/db.json"
-)
+_CORPUS_DB_URL = "https://pythainlp.github.io/pythainlp-corpus/db.json"

# local corpus catalog filename
_CORPUS_DB_FILENAME = "db.json"
(The remaining changed files in this commit are not shown here.)

0 comments on commit e07673f
