Skip to content

Commit

Permalink
Test that the tfidf parameter has desired effect on similarity_matrix
Browse files Browse the repository at this point in the history
  • Loading branch information
Witiko committed Mar 17, 2018
1 parent 6dcb687 commit 609fce3
Showing 1 changed file with 28 additions and 2 deletions.
30 changes: 28 additions & 2 deletions gensim/test/test_keyedvectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import numpy as np

from gensim.corpora import Dictionary
from gensim.models import KeyedVectors as EuclideanKeyedVectors
from gensim.models import KeyedVectors as EuclideanKeyedVectors, TfidfModel
from gensim.test.utils import datapath


Expand Down Expand Up @@ -56,7 +56,33 @@ def test_similarity_matrix(self):
similarity_matrix = self.vectors.similarity_matrix(dictionary, nonzero_limit=3).todense()
self.assertEquals(20, np.sum(similarity_matrix == 0))

# TODO: Add unit test to check that supplied tfidf has desired effect
# Check that processing rows in the order given by IDF has the desired effect.

# Build a two-document corpus with four distinct terms, of which only "understanding" occurs in both documents:
documents = [["honour", "understanding"], ["understanding", "mean", "knop"]]
dictionary = Dictionary(documents)
tfidf = TfidfModel(dictionary=dictionary)

# All terms except for "understanding" have IDF of log2(2 / 1) = log2(2) = 1.0.
# The term "understanding" has IDF of log2(2 / 2) = log2(1) = 0.
#
# If we do not pass the tfidf parameter to the similarity_matrix
# method, then we process rows from 1 to 4. If we do pass the tfidf
# parameter to the similarity_matrix method, then we first process the
# rows 1, 3, 4 that correspond to terms with IDF of 1.0 and then row 2
# that corresponds to the term "understanding" with IDF of 0. Since the
# method is greedy, we end up with two different similarity matrices.

self.assertTrue(
np.isclose(
self.vectors.similarity_matrix(dictionary, nonzero_limit=2).todense(),
np.array([[1, 0.9348248, 0, 0], [0.9348248, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]])))
self.assertTrue(
np.isclose(
self.vectors.similarity_matrix(dictionary, tfidf, nonzero_limit=2).todense(),
np.array([
[1, 0.9348248, 0, 0.9112908], [0.9348248, 1, 0.90007025, 0],
[0, 0.90007025, 1, 0], [0.9112908, 0, 0, 1]])))

def test_most_similar(self):
"""Test most_similar returns expected results."""
Expand Down

0 comments on commit 609fce3

Please sign in to comment.