-
Notifications
You must be signed in to change notification settings - Fork 1
/
visualwords.py
132 lines (92 loc) · 2.81 KB
/
visualwords.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# coding: utf-8
from wordcloud import WordCloud
import re
from PIL import Image
import numpy as np
# Build the word-cloud mask from the silhouette image: elements that are
# exactly 0 stay 0, and every other value is forced to 255.
# NOTE(review): WordCloud is documented to treat pure-white (255) entries as
# "masked out", so only the 0-valued region should receive words - confirm.
silhouette = np.array(Image.open('cat.png'))
mask = np.where(silhouette != 0, 255, 0)
# Read the whole corpus that will be analysed.
# The encoding is passed explicitly so the Japanese text is decoded the same
# way on every platform instead of depending on the locale default.
# NOTE(review): assumes analyze_text.txt is saved as UTF-8 - confirm.
with open('analyze_text.txt', 'r', encoding='utf-8') as f:
    text = f.read()
# ---------------------------------------------
# Remove every run of ASCII letters (a-z, A-Z).
# The runs are deleted outright (replaced by the empty string, not a space).
# ---------------------------------------------
text = re.sub("[a-zA-Z]+", "", text)
# ------------------------------------------------------
# Remove katakana words of three characters or fewer.
#
# Every katakana run is blanked out of the text, and only the runs that are
# at least four characters long are appended back as space-separated words.
#
# The prolonged sound mark "ー" (U+30FC) lies outside the ァ-ヴ range, so it
# is allowed explicitly after the first character. Without it a word such as
# "コンピューター" is cut into "コンピュ" and "タ", and stray "ー" characters
# are left behind in the text.
# ------------------------------------------------------
katakana_pattern = re.compile('[ァ-ヴ][ァ-ヴー]*')

# Collect the long runs before substituting, because the substitution below
# erases every katakana run from the text.
long_katakana = [run for run in katakana_pattern.findall(text) if len(run) >= 4]

text = katakana_pattern.sub(" ", text)
text += "".join(" " + run + " " for run in long_katakana)
# ------------------------------------------------------
# Remove hiragana words of four characters or fewer.
#
# Every hiragana run is blanked out of the text, and only the runs that are
# at least five characters long are appended back as space-separated words.
# ------------------------------------------------------
hiragana_pattern = re.compile('[ぁ-ん]+')

# Collect the long runs before substituting, because the substitution below
# erases every hiragana run from the text.
long_hiragana = [run for run in hiragana_pattern.findall(text) if len(run) >= 5]

text = hiragana_pattern.sub(" ", text)
text += "".join(" " + run + " " for run in long_hiragana)
# ------------------------------------------------------
# Remove kanji words of two characters or fewer.
#
# Every kanji run is blanked out of the text, and only the runs that are at
# least three characters long are appended back as space-separated words.
# ------------------------------------------------------
kanji_pattern = re.compile('[一-龥]+')

# Collect the long runs before substituting, because the substitution below
# erases every kanji run from the text.
long_kanji = [run for run in kanji_pattern.findall(text) if len(run) >= 3]

text = kanji_pattern.sub(" ", text)
text += "".join(" " + run + " " for run in long_kanji)
# ------------------------------------------------------
# Arbitrary words to exclude from the cloud.
# ------------------------------------------------------
stop = ["ピヨピヨ", "ホゲホゲ"]

# ------------------------------------------------------
# WordCloud settings: build the cloud from the filtered text and save it.
# ------------------------------------------------------
wordcloud = WordCloud(
    mask=mask,
    stopwords=stop,
    # Font path; the one below is for macOS. The directory is spelled
    # "/System" (capital S) so the path also resolves on case-sensitive
    # volumes, where the lowercase "/system" spelling does not exist.
    font_path="/System/Library/Fonts/ヒラギノ角ゴシック W4.ttc",
    colormap='copper_r',
    # Optional styling, currently disabled:
    # background_color="white",
    # contour_width=1,
    # contour_color='gray',
    # NOTE(review): per the WordCloud documentation, width/height are ignored
    # when a mask is supplied (the mask's shape is used instead) - confirm.
    width=800,
    height=600,
).generate(text)

# Write the rendered image next to the script's working directory.
wordcloud.to_file("./wordcloud.png")