diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py
index 674689afce..a1f1d22df8 100644
--- a/gensim/models/keyedvectors.py
+++ b/gensim/models/keyedvectors.py
@@ -946,7 +946,9 @@ def nbow(document):
         # Compute WMD.
         return emd(d1, d2, distance_matrix)
 
-    def most_similar_cosmul(self, positive=None, negative=None, topn=10):
+    def most_similar_cosmul(
+            self, positive=None, negative=None, topn=10, restrict_vocab=None
+    ):
         """Find the top-N most similar words, using the multiplicative combination objective,
         proposed by `Omer Levy and Yoav Goldberg "Linguistic Regularities in Sparse and Explicit Word Representations"
         <http://www.aclweb.org/anthology/W14-1618>`_. Positive words still contribute positively towards the similarity,
@@ -959,6 +961,9 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10):
         With a single positive example, rankings will be the same as in the default
         :meth:`~gensim.models.keyedvectors.KeyedVectors.most_similar`.
 
+        Allows calls like most_similar_cosmul('dog', 'cat'), as a shorthand for
+        most_similar_cosmul(['dog'], ['cat']) where 'dog' is positive and 'cat' negative.
+
         Parameters
         ----------
         positive : list of str, optional
@@ -968,6 +973,11 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10):
         topn : int or None, optional
             Number of top-N similar words to return, when `topn` is int. When `topn` is None,
             then similarities for all words are returned.
+        restrict_vocab : int or None, optional
+            Optional integer which limits the range of vectors which are searched for most-similar values.
+            For example, restrict_vocab=10000 would only check the first 10000 word vectors in the vocabulary order.
+            This may be meaningful if the vocabulary is sorted by descending frequency.
+
 
         Returns
         -------
@@ -985,7 +995,14 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10):
         positive = _ensure_list(positive)
         negative = _ensure_list(negative)
 
-        self.fill_norms()
+        self.init_sims()
+
+        if isinstance(positive, str):
+            # allow calls like most_similar_cosmul('dog'), as a shorthand for most_similar_cosmul(['dog'])
+            positive = [positive]
+
+        if isinstance(negative, str):
+            negative = [negative]
 
         all_words = {
             self.get_index(word) for word in positive + negative
@@ -1205,7 +1222,9 @@ def _log_evaluate_word_analogies(section):
             logger.info("%s: %.1f%% (%i/%i)", section['section'], 100.0 * score, correct, correct + incorrect)
             return score
 
-    def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensitive=True, dummy4unknown=False):
+    def evaluate_word_analogies(
+            self, analogies, restrict_vocab=300000, case_insensitive=True,
+            dummy4unknown=False, similarity_function='most_similar'):
         """Compute performance of the model on an analogy test set.
 
         The accuracy is reported (printed to log and returned as a score) for each section separately,
@@ -1231,6 +1250,8 @@ def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensi
         dummy4unknown : bool, optional
             If True - produce zero accuracies for 4-tuples with out-of-vocabulary words.
             Otherwise, these tuples are skipped entirely and not used in the evaluation.
+        similarity_function : str, optional
+            Function name used for similarity calculation.
 
         Returns
         -------
@@ -1286,6 +1307,7 @@ def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensi
                 predicted = None
                 # find the most likely prediction using 3CosAdd (vector offset) method
                 # TODO: implement 3CosMul and set-based methods for solving analogies
+                sims = self.most_similar(positive=[b, c], negative=[a], topn=5, restrict_vocab=restrict_vocab)
                 self.key_to_index = original_key_to_index
                 for element in sims:
diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py
index 2ff7995e0c..ecc44a30e4 100644
--- a/gensim/test/test_fasttext.py
+++ b/gensim/test/test_fasttext.py
@@ -373,6 +373,9 @@ def test_most_similar_cosmul(self):
         self.assertEqual(
             self.test_model.wv.most_similar_cosmul('nights'),
             self.test_model.wv.most_similar_cosmul(positive=['nights']))
+        self.assertEqual(
+            self.test_model.wv.most_similar_cosmul('the', 'and'),
+            self.test_model.wv.most_similar_cosmul(positive=['the'], negative=['and']))
 
     def test_lookup(self):
         # In vocab, sanity check
diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index 56a1ecfae0..8edfe3c04c 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -555,6 +555,12 @@ def test_evaluate_word_analogies(self):
         """Test that evaluating analogies on KeyedVectors give sane results"""
         model = word2vec.Word2Vec(LeeCorpus())
         score, sections = model.wv.evaluate_word_analogies(datapath('questions-words.txt'))
+        score_cosmul, sections_cosmul = model.wv.evaluate_word_analogies(
+            datapath('questions-words.txt'),
+            similarity_function='most_similar_cosmul'
+        )
+        self.assertEqual(score, score_cosmul)
+        self.assertEqual(sections, sections_cosmul)
         self.assertGreaterEqual(score, 0.0)
         self.assertLessEqual(score, 1.0)
         self.assertGreater(len(sections), 0)
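For reference, a minimal usage sketch of what this patch enables (not part of the patch itself): `kv` and the vector file name are placeholders for any trained KeyedVectors instance; the calls simply mirror the signatures added above.

    from gensim.models import KeyedVectors
    from gensim.test.utils import datapath

    # Hypothetical setup: load previously trained word vectors (file name is a placeholder).
    kv = KeyedVectors.load("vectors.kv")

    # New restrict_vocab parameter: limit the cosmul search to the first 10000
    # vocabulary entries (meaningful when the vocabulary is sorted by descending frequency).
    print(kv.most_similar_cosmul(positive=['king'], negative=['man'], topn=5, restrict_vocab=10000))

    # Positional shorthand documented above: 'dog' is positive, 'cat' is negative.
    print(kv.most_similar_cosmul('dog', 'cat'))

    # New similarity_function parameter on the analogy evaluator, as exercised in the test.
    score, sections = kv.evaluate_word_analogies(
        datapath('questions-words.txt'), similarity_function='most_similar_cosmul')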