-
Notifications
You must be signed in to change notification settings - Fork 9
/
word2vector.py
103 lines (93 loc) · 3.24 KB
/
word2vector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import json
import pickle
import sys
from collections import Counter

import gensim
from gensim.models import Word2Vec
def id_freq():
    """Count @-mention frequencies in the username corpus and print them, most frequent first.

    Reads './who_is_h.wakati.withusername' (one whitespace-tokenized tweet per
    line), treats every token starting with "@" as a user mention, and prints
    "mention count" pairs sorted by descending count.
    """
    # Counter replaces the manual get-or-zero bookkeeping; also avoids the
    # original's shadowing of the builtin `id` and of the function's own name.
    counts = Counter()
    with open('./who_is_h.wakati.withusername', 'r') as f:
        for line in f:
            counts.update(tok for tok in line.split() if tok.startswith("@"))
    # most_common() == descending by count, insertion-stable on ties, matching
    # the original sorted(..., key=lambda x: x[1] * -1).
    for mention, freq in counts.most_common():
        print(mention, freq)
def train():
    """Train a Word2Vec model on the username-stripped tweet corpus.

    Reads 'who_is_h.wakati.nousername' (one whitespace-tokenized tweet per
    line), trains a 256-dim model, and pickles it to 'model.nousername.pkl'.
    Prints a progress line every 10,000 tweets.
    """
    tweets = []
    with open('who_is_h.wakati.nousername', 'r') as f:
        for ti, tweet in enumerate(f):
            if ti % 10000 == 0:
                print('now iter %d' % ti)
            tweets.append(tweet.strip().split())
    # NOTE(review): `size=` is the gensim<4.0 keyword; 4.0+ renamed it to
    # `vector_size` — confirm the pinned gensim version before upgrading.
    model = Word2Vec(tweets, size=256, window=5, min_count=3, workers=8)
    # `with` closes the handle deterministically; the original
    # open(...).write(...) leaked the file object.
    with open('model.nousername.pkl', 'wb') as out:
        out.write(pickle.dumps(model))
def train_wikipedia():
    """Train a Word2Vec model on 'wikipedia.txt' and pickle it.

    Reads whitespace-tokenized lines (stopping just before line 10,000,000 to
    bound memory), trains a 256-dim model, and writes 'model.wikipedia.pkl'.
    Prints a progress line every 10,000 lines.
    """
    texts = []
    with open('wikipedia.txt', 'r') as f:
        for ti, text in enumerate(f):
            if ti % 10000 == 0:
                print('now iter %d' % ti)
            if (ti + 1) % 10000000 == 0:
                # break BEFORE appending, so at most 9,999,999 lines are kept
                # (preserves the original's off-by-one-looking cap exactly)
                break
            texts.append(text.strip().split())
    # NOTE(review): `size=` is the gensim<4.0 keyword; 4.0+ uses `vector_size`.
    model = Word2Vec(texts, size=256, window=5, min_count=3, workers=8)
    # close the output handle deterministically instead of leaking it
    with open('model.wikipedia.pkl', 'wb') as out:
        out.write(pickle.dumps(model))
def train_username():
    """Train a Word2Vec model on tweets that still contain usernames.

    Reads 'who_is_h.wakati.withusername', collects the full vocabulary into
    'words.withusername.pkl', then trains a 256-dim model (min_count=1 so
    every username token gets a vector) and pickles it to
    'model.withusername.pkl'. Prints a progress line every 10,000 tweets.
    """
    words = set()
    tweets = []
    with open('who_is_h.wakati.withusername', 'r') as f:
        for ti, tweet in enumerate(f):
            if ti % 10000 == 0:
                print('now iter %d' % ti)
            # split once and reuse; the original tokenized each line twice and
            # abused a list comprehension for the side-effecting set inserts
            tokens = tweet.strip().split()
            tweets.append(tokens)
            words.update(tokens)
    # `with` blocks replace the original leaked open(...).write(...) handles
    with open('words.withusername.pkl', 'wb') as out:
        out.write(pickle.dumps(words, protocol=4))
    # NOTE(review): `size=` is the gensim<4.0 keyword; 4.0+ uses `vector_size`.
    model = Word2Vec(tweets, size=256, window=5, min_count=1, workers=8)
    with open('model.withusername.pkl', 'wb') as out:
        out.write(pickle.dumps(model, protocol=4))
def pred():
    """Interactive similarity queries against the no-username tweet model.

    Reads whitespace-separated words from stdin in an endless loop; a leading
    "-" marks a word as a negative example for most_similar(). Prints the
    parsed positive/negative lists and the similarity results as JSON.
    """
    # NOTE(review): pickle.loads on a model file is unsafe if the file could
    # come from an untrusted source — only load models you produced yourself.
    # `with` closes the handle the original open(...).read() leaked.
    with open('./model.word2vec.nousername.pkl?dl=0', 'rb') as f:
        model = pickle.loads(f.read())
    while True:
        words = input().split()
        positive = [w for w in words if not w.startswith("-")]
        # strip only the leading "-" marker; the original .replace("-", "")
        # also deleted interior dashes, mangling hyphenated tokens
        negative = [w[1:] for w in words if w.startswith("-")]
        print(positive)
        print(negative)
        try:
            tuples = model.wv.most_similar(positive=positive, negative=negative)
            print(json.dumps(tuples, ensure_ascii=False, indent=2))
        except KeyError:
            print("キーが見つかりませんでした")
def pred_wikipedia():
    """Interactive similarity queries against the pretrained Wikipedia model.

    Loads '../word2vec.gensim.model' once, then loops forever reading
    whitespace-separated query words from stdin and printing the nearest
    neighbours as JSON. Unknown words produce a Japanese error message.
    """
    model = Word2Vec.load('../word2vec.gensim.model')
    while True:
        query = input().split()
        try:
            neighbours = model.wv.most_similar(positive=query)
            print(json.dumps(neighbours, ensure_ascii=False, indent=2))
        except KeyError:
            # at least one query word is outside the model's vocabulary
            print("キーが見つかりませんでした")
def pred_user():
    """Interactive similarity queries against the with-username tweet model.

    Loads './model.withusername.pkl' once, then loops forever reading
    whitespace-separated query words from stdin and printing the nearest
    neighbours as JSON. Unknown words produce a Japanese error message.
    """
    # NOTE(review): pickle.loads is unsafe on untrusted files — only load
    # models produced by train_username().
    # `with` closes the handle the original open(...).read() leaked.
    with open('./model.withusername.pkl', 'rb') as f:
        model = pickle.loads(f.read())
    while True:
        query = input().split()
        try:
            neighbours = model.wv.most_similar(positive=query)
            print(json.dumps(neighbours, ensure_ascii=False, indent=2))
        except KeyError:
            print("キーが見つかりませんでした")
if __name__ == '__main__':
    # Flag/handler table instead of repeated ifs. Flags are not mutually
    # exclusive: every flag present in argv runs its handler, in this order —
    # exactly as the original chain of independent `if` statements did.
    modes = (
        ('--train', train),
        ('--train_username', train_username),
        ('--train_wikipedia', train_wikipedia),
        ('--pred', pred),
        ('--pred_user', pred_user),
        ('--pred_wikipedia', pred_wikipedia),
    )
    for flag, handler in modes:
        if flag in sys.argv:
            handler()