first commit of training code and examples of inference.

t1t0n · Aug 18, 2019 · 46164b5 · 46164b5
commit 46164b5
Show file tree

Hide file tree

Showing 4 changed files with 37 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,5 @@
+*.model
+.idea/
+*.npy
+*.txt
+*.bz2
diff --git a/test_similar_words.py b/test_similar_words.py
@@ -0,0 +1,7 @@
+from gensim.models.word2vec import Word2Vec
+# Load the trained model from disk.
+model = Word2Vec.load("georgian_word2vec.model")
+# Query through model.wv; the Word2Vec-level most_similar_cosmul shortcut is deprecated.
+newton = model.wv.most_similar_cosmul(positive=['ნიუტონი'], topn=5)
+for rank, (word, score) in enumerate(newton, start=1):
+    print("{}. {} ({:1.2f})".format(rank, word, score))
diff --git a/test_word_vector.py b/test_word_vector.py
@@ -0,0 +1,4 @@
+from gensim.models.word2vec import Word2Vec
+# Print the vector stored for one word; index via model.wv (Word2Vec[...] is deprecated).
+model = Word2Vec.load("georgian_word2vec.model")
+print(model.wv['ნიუტონი'])
diff --git a/train.py b/train.py
@@ -0,0 +1,21 @@
+from gensim.corpora.wikicorpus import WikiCorpus
+from gensim.models.word2vec import Word2Vec
+import multiprocessing
+import logging
+
+# Entry-point guard: multiprocessing's "spawn" start method re-imports the main module
+# in each child process, and unguarded top-level code would restart the whole pipeline.
+if __name__ == "__main__":
+    logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s",
+                        datefmt='%H:%M:%S', level=logging.INFO)
+    # Read the wiki dump; dictionary={} presumably skips gensim's own vocabulary scan.
+    wiki = WikiCorpus('kawiki-latest-pages-articles.xml.bz2',
+                      lemmatize=False, dictionary={})
+    # Keep all texts in a list so Word2Vec can iterate over them more than once.
+    sentences = list(wiki.get_texts())
+    # Training parameters, passed to Word2Vec as keyword arguments (gensim 3.x names).
+    params = {'size': 200, 'window': 10, 'min_count': 10,
+              'workers': max(1, multiprocessing.cpu_count() - 1), 'sample': 1E-3,
+              'iter': 5, 'sg': 1, 'hs': 1}
+    word2vec = Word2Vec(sentences, **params)
+    word2vec.save("georgian_word2vec.model")
-Original file line number
+Diff line change
@@ -0,0 +1,5 @@
+    *.model
+    .idea/
+    *.npy
+    *.txt
+    *.bz2