Embedding #230

Merged · 3 commits · Nov 28, 2022
6 changes: 4 additions & 2 deletions data.py
@@ -266,7 +266,8 @@ def __iter__(self):
             yield TaggedDocument(word_tokenize(normalizer.normalize(list_of_words)), [i])


-def train_sentence_embedding(dataset_path, model_file='sent_embedding.model', min_count=5, workers=multiprocessing.cpu_count()-1, windows=5, vector_size=100, epochs=10, return_model=False):
+def train_sentence_embedding(dataset_path, model_file='sent_embedding.model', min_count=5, workers=multiprocessing.cpu_count()-1, windows=5, vector_size=300, epochs=10, return_model=False):
+    workers = 1 if workers == 0 else workers
     doc = SentenceEmbeddingCorpus(dataset_path)
     model = Doc2Vec(min_count=min_count,
                     window=windows,
@@ -294,7 +295,8 @@ def __iter__(self):
         yield simple_preprocess(normalizer.normalize(line))


-def train_word_embedding(dataset_path, dest_path='word_embedding.model', min_count=5, workers=multiprocessing.cpu_count()-1, windows=5, vector_size=100, epochs=10, return_model=False):
+def train_word_embedding(dataset_path, dest_path='word_embedding.model', min_count=5, workers=multiprocessing.cpu_count()-1, windows=5, vector_size=200, epochs=10, return_model=False):
+    workers = 1 if workers == 0 else workers
     doc = WordEmbeddingCorpus(dataset_path)
     model = FastText(min_count=min_count,
                      window=windows,
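Taken together, both trainers now guard against a zero worker count on single-core machines and default to larger vectors (200 dimensions for FastText words, 300 for Doc2Vec sentences). A minimal usage sketch, assuming `data.py` is on the import path; `corpus.txt` is a hypothetical line-per-sentence Persian corpus, and all keyword arguments come from the signatures above:

```python
# Hypothetical invocation of the updated trainers (paths are illustrative).
from data import train_word_embedding, train_sentence_embedding

# FastText word vectors, now 200-dimensional by default.
train_word_embedding('corpus.txt', dest_path='word_embedding.model')

# Doc2Vec sentence vectors, now 300-dimensional by default.
train_sentence_embedding('corpus.txt', model_file='sent_embedding.model')
```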
25 changes: 14 additions & 11 deletions hazm/embedding.py
@@ -1,4 +1,4 @@
-from . import word_tokenize
+from hazm import word_tokenize
 from gensim.models import KeyedVectors, Doc2Vec, fasttext
 from gensim.scripts.glove2word2vec import glove2word2vec
 import os
@@ -58,24 +58,24 @@ def __getitem__(self, word):
         return self.model[word]


-    def doesnt_match(self, txt):
+    def doesnt_match(self, words):
         '''Finds the word that does not belong with the others.

         Examples:
             >>> wordEmbedding = WordEmbedding(model_type='model_type', model_path='resources/cc.fa.300.bin')
-            >>> wordEmbedding.doesnt_match('سلام درود خداحافظ پنجره')
+            >>> wordEmbedding.doesnt_match(['سلام', 'درود', 'خداحافظ', 'پنجره'])
             'پنجره'

         Args:
-            txt (str): a text containing the words
+            words (list[str]): the list of words to compare

         Returns:
-            (str): the word unrelated to the other words in the text
+            (str): the word unrelated to the other words in the list
         '''

         if not self.model:
             raise AttributeError('Model must not be None! Please load model first.')
-        return self.model.doesnt_match(word_tokenize(txt))
+        return self.model.doesnt_match(words)
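For reviewers, a before/after sketch of the call-site change; the `model_type` value and model path are placeholders copied from the docstring example:

```python
from hazm import WordEmbedding

wordEmbedding = WordEmbedding(model_type='model_type', model_path='resources/cc.fa.300.bin')

# Before: the method tokenized a raw string internally.
# wordEmbedding.doesnt_match('سلام درود خداحافظ پنجره')
# After: the caller passes an explicit token list.
wordEmbedding.doesnt_match(['سلام', 'درود', 'خداحافظ', 'پنجره'])  # -> 'پنجره'
```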


     def similarity(self, word1, word2):
@@ -94,12 +94,12 @@ def similarity(self, word1, word2):
             word2 (str): the second word

         Returns:
-            (numpy.float32): the similarity score of the two words
+            (float): the similarity score of the two words
         '''

         if not self.model:
             raise AttributeError('Model must not be None! Please load model first.')
-        return self.model.similarity(word1, word2)
+        return float(str(self.model.similarity(word1, word2)))
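The `float(str(...))` round-trip presumably exists so callers get a built-in Python float rather than a `numpy.float32`, which, among other things, is not JSON-serializable. A small sketch of the difference; the score value is illustrative:

```python
import json
import numpy as np

score = np.float32(0.61043)    # the type gensim's similarity() returns
# json.dumps(score)            # raises TypeError: float32 is not JSON serializable
json.dumps(float(str(score)))  # fine: the cast yields a built-in float
```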


     def get_vocab(self):
@@ -120,7 +120,7 @@ def get_vocab(self):


-    def nearest_words(self, word, topn):
+    def nearest_words(self, word, topn=5):
         '''Returns the words most related to the input word.

         Examples:
@@ -234,12 +234,15 @@ def similarity(self, sent1, sent2):
             sent2 (str): the second sentence

         Returns:
-            (numpy.float32): the similarity score of the two sentences
+            (float): the similarity score of the two sentences
         '''

         if not self.model:
             raise AttributeError('Model must not be None! Please load model first.')
-        return self.model.similarity_unseen_docs(tokenized_sent1, tokenized_sent2)
+        else:
+            tokenized_sent1 = word_tokenize(sent1)
+            tokenized_sent2 = word_tokenize(sent2)
+            return float(str(self.model.similarity_unseen_docs(tokenized_sent1, tokenized_sent2)))
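A short end-to-end sketch of the updated sentence API; the class name `SentEmbedding`, its constructor argument, and the sentences are assumptions for illustration, not confirmed by this diff:

```python
from hazm import SentEmbedding  # assumed name of the sentence-embedding wrapper

sent_embedding = SentEmbedding(model_path='sent_embedding.model')  # assumed constructor
score = sent_embedding.similarity('او به مدرسه رفت', 'او به دانشگاه رفت')
assert isinstance(score, float)  # a plain float after this PR, not numpy.float32
```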