Skip to content

Commit

Permalink
first commit of training code and examples of inference.
Browse files Browse the repository at this point in the history
  • Loading branch information
t1t0n committed Aug 18, 2019
0 parents commit 46164b5
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
*.model
.idea/
*.npy
*.txt
*.bz2
7 changes: 7 additions & 0 deletions test_similar_words.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from gensim.models.word2vec import Word2Vec

# Load the trained Georgian word2vec model from the working directory.
model = Word2Vec.load("georgian_word2vec.model")

# Query neighbours through the KeyedVectors object (`model.wv`): calling
# most_similar_cosmul directly on the Word2Vec model is a deprecated proxy
# in gensim 3.x and is no longer available in gensim 4.
newton = model.wv.most_similar_cosmul(positive=['ნიუტონი'], topn=5)

# Print a 1-based ranking of the five closest words with their scores.
for rank, (word, score) in enumerate(newton, start=1):
    print("{}. {} ({:1.2f})".format(rank, word, score))
4 changes: 4 additions & 0 deletions test_word_vector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from gensim.models.word2vec import Word2Vec

# Load the trained Georgian word2vec model from the working directory.
model = Word2Vec.load("georgian_word2vec.model")

# Look the vector up through the KeyedVectors object (`model.wv`): indexing
# the Word2Vec model itself is deprecated in gensim 3.x and fails in gensim 4.
print(model.wv['ნიუტონი'])
21 changes: 21 additions & 0 deletions train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.word2vec import Word2Vec
import multiprocessing
import logging

def training_params():
    """Return the keyword arguments passed to ``Word2Vec`` for training.

    The keys use the gensim 3.x parameter names (``size``, ``iter``).
    NOTE(review): gensim >= 4 renamed these to ``vector_size`` / ``epochs``
    and dropped ``WikiCorpus(lemmatize=...)`` -- confirm against the
    installed gensim version before upgrading.
    """
    return {
        'size': 200,
        'window': 10,
        'min_count': 10,
        # Leave one core free for the rest of the system, but never go
        # below a single worker on one-core machines.
        'workers': max(1, multiprocessing.cpu_count() - 1),
        'sample': 1E-3,
        'iter': 5,
        'sg': 1,
        'hs': 1,
    }


def main():
    """Train a word2vec model on the Georgian Wikipedia dump and save it."""
    # enable logging
    logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s",
                        datefmt='%H:%M:%S', level=logging.INFO)

    # load Wiki dump file; the empty dictionary skips the vocabulary scan
    # WikiCorpus would otherwise perform on construction
    wiki = WikiCorpus('kawiki-latest-pages-articles.xml.bz2',
                      lemmatize=False, dictionary={})
    # Word2Vec iterates over the corpus several times, so the token lists
    # are materialised once instead of re-parsing the dump on every pass.
    sentences = list(wiki.get_texts())

    # train and save word2vec model
    word2vec = Word2Vec(sentences, **training_params())
    word2vec.save("georgian_word2vec.model")


# WikiCorpus.get_texts() starts a multiprocessing pool. Without this guard,
# platforms that spawn worker processes (Windows, macOS) re-import the script
# in every worker and re-run the whole training pipeline.
if __name__ == "__main__":
    main()

0 comments on commit 46164b5

Please sign in to comment.