Skip to content

Commit

Permalink
Add KeyedVectors.relative_cosine_similarity (#2307)
Browse files Browse the repository at this point in the history
* Added Function relative_cosine_similarity

* Updated function relative_cosine_similarity

* Updated Function relative_cosine_similarity

* Added test for relative_cosine_similarity

* Added unit test for relative_cosine_similarity

* Update keyedvectors.py

* Updated test_relative_cosine_similarity

* updated function relative_cosine_similarity

* Updated relative_cosine_similarity

* Update keyedvectors.py

* Update keyedvectors.py

* Update keyedvectors.py

* Update test_keyedvectors.py

* Added link properly

* Update keyedvectors.py
  • Loading branch information
rsdel2007 authored and menshikh-iv committed Jan 15, 2019
1 parent f3cf463 commit a864e02
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 0 deletions.
30 changes: 30 additions & 0 deletions gensim/models/keyedvectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1330,6 +1330,36 @@ def init_sims(self, replace=False):
logger.info("precomputing L2-norms of word weight vectors")
self.vectors_norm = _l2_norm(self.vectors, replace=replace)

def relative_cosine_similarity(self, wa, wb, topn=10):
    """Compute the relative cosine similarity between two words given top-n similar words.

    Implements equation (1) of `Artuur Leeuwenberg, Mihaela Vela, Jon Dehdari, Josef van Genabith
    "A Minimally Supervised Approach for Synonym Extraction with Word Embeddings"
    <https://ufal.mff.cuni.cz/pbml/105/art-leeuwenberg-et-al.pdf>`_: the similarity of `wa` and `wb`
    divided by the sum of the similarities between `wa` and its `topn` most similar words.

    For WordNet synonyms, if rcs(topn=10) is greater than 0.10 then `wa` and `wb` are more similar than
    any arbitrary word pairs.

    Parameters
    ----------
    wa : str
        Word for which the top-n most similar words are looked up.
    wb : str
        Word whose relative cosine similarity with `wa` is evaluated.
    topn : int, optional
        Number of most similar words to `wa` used for normalization.

    Returns
    -------
    float
        Relative cosine similarity between `wa` and `wb`. May be a NumPy floating scalar rather than
        a built-in float, depending on the type of the similarity scores.

    Raises
    ------
    ValueError
        If the lookup of similar words for `wa` returns nothing (there is nothing to normalize by).

    """
    sims = self.similar_by_word(wa, topn)
    if not sims:
        # Explicit check rather than `assert`: assertions are stripped under `python -O`,
        # which would turn this case into a division by an empty sum.
        raise ValueError("Cannot calculate relative cosine similarity without any similar words.")

    # `sims` holds (word, similarity) pairs; only the similarity scores are summed.
    return float(self.similarity(wa, wb)) / sum(sim for _, sim in sims)


class WordEmbeddingSimilarityIndex(TermSimilarityIndex):
"""
Expand Down
23 changes: 23 additions & 0 deletions gensim/test/test_keyedvectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,29 @@ def test_most_similar_topn(self):
predicted = self.vectors.most_similar('war', topn=None)
self.assertEqual(len(predicted), len(self.vectors.vocab))

def test_relative_cosine_similarity(self):
    """Test relative_cosine_similarity returns expected results with an input of a word pair and topn."""
    topn = 10
    wordnet_syn = [
        'good', 'goodness', 'commodity', 'trade_good', 'full', 'estimable', 'honorable',
        'respectable', 'beneficial', 'just', 'upright', 'adept', 'expert', 'practiced', 'proficient',
        'skillful', 'skilful', 'dear', 'near', 'dependable', 'safe', 'secure', 'right', 'ripe', 'well',
        'effective', 'in_effect', 'in_force', 'serious', 'sound', 'salutary', 'honest', 'undecomposed',
        'unspoiled', 'unspoilt', 'thoroughly', 'soundly',
    ]  # synonyms for "good" as per wordnet

    # Cosine similarity of "good" with every in-vocabulary synonym, in decreasing order.
    cos_sim = sorted(
        (self.vectors.similarity("good", syn) for syn in wordnet_syn if syn in self.vectors.vocab),
        reverse=True,
    )
    # The reference value below is normalized by the `topn` best synonyms, so that many must exist.
    self.assertGreaterEqual(len(cos_sim), topn)

    # Relative cosine similarity of two similar words, compared against a WordNet-based reference.
    rcs_wordnet = self.vectors.similarity("good", "nice") / sum(cos_sim[:topn])
    rcs = self.vectors.relative_cosine_similarity("good", "nice", topn)
    self.assertGreaterEqual(rcs_wordnet, rcs)
    self.assertTrue(np.allclose(rcs_wordnet, rcs, rtol=0, atol=0.125))

    # Relative cosine similarity of two non-similar words stays below the 0.10 synonym threshold.
    rcs = self.vectors.relative_cosine_similarity("good", "worst", topn)
    self.assertLess(rcs, 0.10)

def test_most_similar_raises_keyerror(self):
"""Test most_similar raises KeyError when input is out of vocab."""
with self.assertRaises(KeyError):
Expand Down

0 comments on commit a864e02

Please sign in to comment.