-
Notifications
You must be signed in to change notification settings - Fork 1
/
default_visualwords.py
123 lines (83 loc) · 2.45 KB
/
default_visualwords.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# coding: utf-8
from wordcloud import WordCloud
import re
# Load the whole input text in one read.
# The encoding is pinned to UTF-8 so the Japanese text is decoded the same way
# on every platform instead of depending on the locale's default encoding.
with open('analyze_text.txt', 'r', encoding='utf-8') as f:
    text = f.read()
# Delete every run of ASCII letters (a-z, A-Z) from the text.
romaji = re.compile("[a-zA-Z]+")
text = re.sub(romaji, "", text)
# Remove katakana runs of three characters or fewer.
# The character class includes the prolonged sound mark "ー" (U+30FC), which
# lies outside the ァ-ヴ range; without it, words such as "データ" are split
# into one-character pieces and wrongly discarded.
katanaka_pattern = re.compile('[ァ-ヴー]+')

# Every maximal katakana run, in order of appearance.
found_katanaka_list = katanaka_pattern.findall(text)

# Only runs of at least four characters are kept.
four_text_list = [word for word in found_katanaka_list if len(word) >= 4]

# Blank out every katakana run, then re-append the kept ones at the end of the
# text, each padded with spaces so it remains a separate token.
text = katanaka_pattern.sub(" ", text)
text += "".join(" " + word + " " for word in four_text_list)
# Remove hiragana runs of four characters or fewer.
hiragana_pattern = re.compile('[ぁ-ん]+')

# Every maximal hiragana run, in order of appearance.
found_hiragana_list = hiragana_pattern.findall(text)

# Only runs of at least five characters are kept.
five_text_list = [word for word in found_hiragana_list if len(word) >= 5]

# Blank out every hiragana run, then re-append the kept ones at the end of the
# text, each padded with spaces so it remains a separate token.
text = hiragana_pattern.sub(" ", text)
text += "".join(" " + word + " " for word in five_text_list)
# Remove kanji runs of two characters or fewer.
kanzi_pattern = re.compile('[一-龥]+')

# Every maximal kanji run, in order of appearance.
found_kanzi_list = kanzi_pattern.findall(text)

# Only runs of at least three characters are kept.
three_text_list = [word for word in found_kanzi_list if len(word) >= 3]

# Blank out every kanji run, then re-append the kept ones at the end of the
# text, each padded with spaces so it remains a separate token.
text = kanzi_pattern.sub(" ", text)
text += "".join(" " + word + " " for word in three_text_list)
# Words to leave out of the cloud; edit this list as needed.
stop = [
    "ピヨピヨ",
    "ホゲホゲ",
]

# WordCloud settings: white background, a Japanese-capable font, the stop
# words above, and an 800x600 canvas.
cloud_settings = dict(
    background_color="white",
    font_path="/system/Fonts/ヒラギノ角ゴシック W4.ttc",
    stopwords=stop,
    width=800,
    height=600,
)

# Build the cloud from the filtered text and write it out as a PNG.
wordcloud = WordCloud(**cloud_settings).generate(text)
wordcloud.to_file("./wordcloud.png")