# import modules & set up logging
import logging
import os

import snowballstemmer
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from string import ascii_lowercase

# NLTK data used below: nltk.download('stopwords') and nltk.download('wordnet')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# initialize stemmer
stemmer = snowballstemmer.stemmer('english')
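# e.g. stemmer.stemWords(['connection', 'called']) -> ['connect', 'call']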
# grab stopword list, extend it a bit, and then turn it into a set for later
stop = stopwords.words('english')
# manual rewrites applied to lemmatized tokens
mapp = {'advice': 'advise'}
stop.extend(['may','also','zero','one','two','three','four','five','six','seven','eight','nine','ten','across','among','beside','however','yet','within']+list(ascii_lowercase))
stop.extend(['using', 'way', 'would', 'use', 'i', 'don\'t', 'type', 'it', 'i\'m', 'is', 'get', '<a', 'want', 'another', 'could', '+', 'something', 'used', 'it\'s', 'like', '=', '-', 'see', 'look', 'id', 'make', 'size', 'getting', 'let', 'height', 'maybe', 'know', 'thanks', 'fine', 'always', 'stored', 'us', 'bit', 'called', 'create', 'hand', 'menu'])
# corpus-specific noise: log levels, server/log tokens, and similar artifacts
stop.extend(['warn', 'main', 'localhost', 'http', 'startstop', 'bio', 'info', 'cluster', 'finest', 'chatapp', 'disabled', 'new', 'clientoutboundchannel', 'appenders', 'run', 'okdebug', 'monkey_', 'successful', 'finer', 'apr', 'edt', 'audit', 'jul', 'heartbeat', 'reconnection', 'uma', 'disconnected', 'ping', 'serv', 'jgroups', 'javawsapplicationmain'])
stop.extend(['discovered', 'severe', 'remote', 'dc', 'maig', 'trace', 'wrappermanager', 'cest', 'connection', 'elasticsearch', 'maynard', 'roottracer', 'tiboldt', 'djavax', 'livebeansview', 'control', 'temporarily', 'completed', 'asyncio', 'bootstrap', 'cancelling', 'wthlnap', 'shutting', 'infowrapper', 'mchange', 'feb', 'cfpvednfsg'])
stop.extend(['logged', 'appeventdao', 'datasourceutils', 'forwarding', 'elasticsearchmaynard', 'imyf', 'connection[', 'unwanted', 'localvalue', 'exited', 'newsession', 'srvmain', 'gradle\\', 'lient', 'defuncting', 'org', 'errorexit', 'upclsch', 'permessage', 'httpsessionsecuritycontextrepository'])
stop.extend(['ulimit', '_auth', 'stdout', 'dwrapper'])
# also filter the stemmed variants of the stopwords, e.g. 'called' -> 'call'
stop = set(stop) | set(stemmer.stemWords(stop))
stop.remove('how')  # 'how' is an NLTK stopword, but keep it as a content word here
# lemmatizer used to normalize plural/inflected forms (requires the WordNet corpus)
lmtzr = WordNetLemmatizer()
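# e.g. lmtzr.lemmatize('windows') -> 'window'; note the default part of speech
# is noun, so verb inflections like 'suggested' pass through unchanged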
# streams one cleaned, tokenized sentence per line from every file in a directory
class MySentences(object):
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            with open(os.path.join(self.dirname, fname), errors='ignore') as infile:
                for line in infile:
                    words = []
                    for token in line.split():
                        if len(token) <= 3:
                            continue
                        # strip surrounding brackets, e.g. '[advice]' -> 'advice'
                        if token[0] == '[':
                            token = token.replace('[', '')
                        if token[-1] == ']':
                            token = token.replace(']', '')
                        token = token.lower()
                        if token in stop or len(token) <= 2:
                            continue
                        try:
                            lemma = lmtzr.lemmatize(token)
                        except Exception:
                            continue
                        # drop non-ASCII characters, then apply the manual rewrites
                        lemma = lemma.encode('ascii', 'ignore').decode('ascii')
                        lemma = mapp.get(lemma, lemma)
                        if lemma:
                            words.append(lemma)
                    if words:
                        yield words
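# a hypothetical input line and its cleaned output, for illustration:
#   "The [advice] here was useful"  ->  ['advise', 'useful']
# ('The' and 'was' fail the length filter, 'here' is a stopword, brackets are
#  stripped, and 'advice' is rewritten to 'advise' via mapp)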
sentences = MySentences('/home/user/Desktop/textll/')  # a memory-friendly iterator

# train one model with all the non-default parameters together; calling
# Word2Vec(sentences, ...) once per parameter would retrain from scratch
# each time and discard the earlier models
model = Word2Vec(sentences,
                 window=10,     # context window size (default 5)
                 min_count=10,  # ignore words with fewer occurrences (default 5)
                 size=200,      # vector dimensionality (default 100; 'vector_size' in gensim >= 4.0)
                 workers=4)     # parallel training threads
model.save('/home/user/Desktop/modd/mymodel')
print(model.most_similar('suggest', topn=10))  # model.wv.most_similar in newer gensim
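# the saved model can later be reloaded and queried without retraining;
# a minimal sketch, assuming the save above succeeded:
#   model = Word2Vec.load('/home/user/Desktop/modd/mymodel')
#   model.most_similar('suggest', topn=10)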