-
Notifications
You must be signed in to change notification settings - Fork 29
/
Copy pathsegmentation.py
54 lines (48 loc) · 1.91 KB
/
segmentation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# -*- coding: utf-8 -*-
__author__ = "ALEX-CHUN-YU (P76064538@mail.ncku.edu.tw)"
import jieba
import logging
from hanziconv import HanziConv
# 進行斷詞並過濾 stopword
class Segmentation(object):
def __init__(self):
# 用默認 Formatter 為日誌系統建立一個 StreamHandler ,設置基礎配置並加到 root logger 中
logging.basicConfig(format = "%(asctime)s : %(levelname)s : %(message)s", level = logging.INFO)
self.stopwordset = set()
# 讀取 stopword 辭典,並存到 stopwordset
def set_stopword(self):
with open("stopwords.txt", "r", encoding = "utf-8") as stopwords:
for stopword in stopwords:
self.stopwordset.add(stopword.strip('\n'))
#print(self.stopwordset)
print("StopWord Set 已儲存!")
# 簡 to 繁
def simplified_to_traditional(self):
logging.info("等待中..(簡 to 繁)")
traditional = open("traditional.txt", "w", encoding = "utf-8")
with open("wiki_text.txt", "r", encoding = "utf-8") as simplified:
for s in simplified:
traditional.write(HanziConv.toTraditional(s))
print("成功簡體轉繁體!")
traditional.close()
# 斷詞(Segmentation)並過濾掉停用詞(Stop Word)
def segmentation(self):
logging.info("等待中..(jieba 斷詞,並過濾停用詞)")
segmentation = open("segmentation.txt", "w", encoding = "utf-8")
with open("traditional.txt", "r", encoding = "utf-8") as Corpus:
for sentence in Corpus:
sentence = sentence.strip("\n")
pos = jieba.cut(sentence, cut_all = False)
for term in pos:
if term not in self.stopwordset:
segmentation.write(term + " ")
print("jieba 斷詞完畢,並已完成過濾停用詞!")
segmentation.close()
if __name__ == "__main__":
segmentation = Segmentation()
# 讀取停用詞辭典
segmentation.set_stopword()
# data 進行簡體轉繁體
segmentation.simplified_to_traditional()
# 進行 jieba 斷詞同步過濾停用詞,並產生辭典
segmentation.segmentation()