-
Notifications
You must be signed in to change notification settings - Fork 16
/
train_w2v.py
29 lines (28 loc) · 1.09 KB
/
train_w2v.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
from gensim.models import Word2Vec
import logging
from gensim.models import word2vec
# NOTE(review): the original script referenced `out` and `path` without ever
# defining them, so it stopped with a NameError at the first save. The
# directory of the hard-coded base corpus is used for both as a placeholder --
# confirm the real output and contest-data directories. Both values must end
# with a path separator because file names are appended by concatenation.
BASE_CORPUS = "/home/kesci/work/zhifeng/corpus.csv"
OUT_DIR = "/home/kesci/work/zhifeng/"
DATA_DIR = "/home/kesci/work/zhifeng/"

BASE_MODEL = OUT_DIR + "skip_w2v_all_300.model"
NEW_CORPUS = OUT_DIR + "new_corpus.csv"
CONTEST_FILE = DATA_DIR + "bytedance_contest.final_2.csv"
NEW_MODEL = OUT_DIR + "new_skip_w2v_all_300.model"

# How often (in written lines) corpus building reports progress. The original
# wrapped the input in `tqdm`, which the file never imports; progress goes
# through `logging` instead so no extra package is required.
PROGRESS_EVERY = 1000000


def train_base_model(corpus_path, model_path):
    """Train a 300-dimensional word2vec model (sg=1) and save it.

    Args:
        corpus_path: Text file handed to ``word2vec.Text8Corpus``.
        model_path: Destination passed to ``model.save``.

    Returns:
        The trained ``Word2Vec`` model.
    """
    # NOTE(review): Text8Corpus is documented for one continuous stream of
    # whitespace-separated tokens; if the corpus is one sentence per line,
    # LineSentence may be what is wanted -- confirm before switching.
    sentences = word2vec.Text8Corpus(corpus_path)
    # NOTE(review): `size` and `iter` are the gensim 3.x keyword names
    # (gensim 4 calls them `vector_size` and `epochs`) -- confirm the
    # installed version before renaming.
    model = word2vec.Word2Vec(sentences, size=300, window=5, min_count=1,
                              iter=5, sg=1, workers=8)
    model.save(model_path)
    return model


def iter_corpus_sentences(lines):
    """Yield corpus lines from ``id,query,id,title`` records.

    A query is yielded once for each run of consecutive records that share
    it; every record's title is yielded. Blank lines are skipped.

    Args:
        lines: Iterable of text lines (an open file works).

    Yields:
        Query and title strings, without trailing newlines.

    Raises:
        ValueError: If a non-blank line does not have exactly four
            comma-separated fields.
    """
    previous_query = ''
    for line_no, line in enumerate(lines, 1):
        record = line.strip()
        if not record:
            # A trailing blank line used to crash the 4-way unpack.
            continue
        fields = record.split(',')
        if len(fields) != 4:
            raise ValueError(
                "line %d: expected 4 comma-separated fields, got %d"
                % (line_no, len(fields)))
        _, query, _, title = fields
        if query != previous_query:
            previous_query = query
            yield query
        yield title


def build_new_corpus(src_path, dst_path):
    """Write the query/title corpus derived from *src_path* to *dst_path*.

    Both files are managed by one ``with`` statement so the output handle is
    closed even when a malformed record aborts the loop.

    Args:
        src_path: CSV file with ``id,query,id,title`` records.
        dst_path: Output text file, one query or title per line.

    Returns:
        Number of lines written.
    """
    written = 0
    # Text8Corpus decodes its input as UTF-8, so write UTF-8 explicitly
    # instead of relying on the locale default.
    with open(src_path, 'r', encoding='utf-8') as fin, \
            open(dst_path, 'w', encoding='utf-8') as fout:
        for written, sentence in enumerate(iter_corpus_sentences(fin), 1):
            fout.write(sentence + '\n')
            if written % PROGRESS_EVERY == 0:
                logging.info("wrote %d corpus lines", written)
    logging.info("finished %s: %d lines", dst_path, written)
    return written


def continue_training(model_path, corpus_path, save_path):
    """Load a saved model, extend its vocabulary and train on a new corpus.

    Args:
        model_path: Model file previously written by ``model.save``.
        corpus_path: Text file handed to ``word2vec.Text8Corpus``.
        save_path: Destination for the updated model.

    Returns:
        The updated ``Word2Vec`` model.
    """
    model = word2vec.Word2Vec.load(model_path)
    sentences = word2vec.Text8Corpus(corpus_path)
    # update=True adds the new corpus' words to the existing vocabulary.
    model.build_vocab(sentences, update=True)
    model.train(sentences, total_examples=model.corpus_count, epochs=5)
    model.save(save_path)
    return model


def main():
    """Run the pipeline: base training, corpus building, further training."""
    # Configured once; the original's second basicConfig call was a no-op.
    logging.basicConfig(
        format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)
    train_base_model(BASE_CORPUS, BASE_MODEL)
    # ##### further train
    build_new_corpus(CONTEST_FILE, NEW_CORPUS)
    continue_training(BASE_MODEL, NEW_CORPUS, NEW_MODEL)


if __name__ == "__main__":
    main()