def relative_cosine_similarity(self, wa, wb, topn=10):
    """Compute the relative cosine similarity between two words given top-n similar words.

    Implements equation (1) of Artuur Leeuwenberg, Mihaela Vela, Jon Dehdari and Josef van Genabith,
    "A Minimally Supervised Approach for Synonym Extraction with Word Embeddings": the cosine
    similarity of `wa` and `wb` divided by the sum of the cosine similarities between `wa` and
    each of its `topn` most similar words.

    For WordNet synonyms, if rcs(topn=10) is greater than 0.10 then `wa` and `wb` are more similar
    than any arbitrary word pair.

    Parameters
    ----------
    wa: str
        Word whose top-n most similar words form the normalisation term.
    wb: str
        Word whose relative cosine similarity with `wa` is evaluated.
    topn: int, optional
        Number of most similar words to `wa` to take into account.

    Returns
    -------
    float
        Relative cosine similarity between `wa` and `wb`.

    Raises
    ------
    AssertionError
        If the lookup of words similar to `wa` returns nothing, which leaves the
        normalisation term undefined.

    """
    sims = self.similar_by_word(wa, topn)
    # Raised explicitly rather than via an `assert` statement: asserts are stripped
    # under `python -O`, which would let an empty list through and surface as an
    # unrelated ZeroDivisionError on the division below.
    if not sims:
        raise AssertionError("Failed code invariant: list of similar words must never be empty.")

    # Normalise the pairwise similarity by the total similarity mass of wa's neighbourhood.
    return float(self.similarity(wa, wb)) / sum(sim for _, sim in sims)
def test_relative_cosine_similarity(self):
    """Test relative_cosine_similarity returns expected results with an input of a word pair and topn."""
    wordnet_syn = [
        'good', 'goodness', 'commodity', 'trade_good', 'full', 'estimable', 'honorable',
        'respectable', 'beneficial', 'just', 'upright', 'adept', 'expert', 'practiced', 'proficient',
        'skillful', 'skilful', 'dear', 'near', 'dependable', 'safe', 'secure', 'right', 'ripe', 'well',
        'effective', 'in_effect', 'in_force', 'serious', 'sound', 'salutary', 'honest', 'undecomposed',
        'unspoiled', 'unspoilt', 'thoroughly', 'soundly',
    ]  # synonyms for "good" as per wordnet

    # Cosine similarity of "good" with every in-vocabulary WordNet synonym, in decreasing order.
    cos_sim = sorted(
        (self.vectors.similarity("good", syn) for syn in wordnet_syn if syn in self.vectors.vocab),
        reverse=True,
    )
    # The reference value below needs ten similarities; fail with a clear message
    # instead of an IndexError if the test vocabulary covers fewer synonyms.
    self.assertGreaterEqual(len(cos_sim), 10)

    # Relative cosine similarity of two similar words, normalised by the ten
    # strongest WordNet synonyms instead of the model's own ten nearest neighbours.
    rcs_wordnet = self.vectors.similarity("good", "nice") / sum(cos_sim[:10])
    rcs = self.vectors.relative_cosine_similarity("good", "nice", 10)
    self.assertGreaterEqual(rcs_wordnet, rcs)
    self.assertTrue(np.allclose(rcs_wordnet, rcs, rtol=0, atol=0.125))

    # Relative cosine similarity of two non-similar words stays below the 0.10 synonym threshold.
    rcs = self.vectors.relative_cosine_similarity("good", "worst", 10)
    self.assertLess(rcs, 0.10)