Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add flag parameters for topic diff #1448

Merged
merged 13 commits into from
Aug 3, 2017
48 changes: 32 additions & 16 deletions gensim/models/ldamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -971,16 +971,18 @@ def get_term_topics(self, word_id, minimum_probability=None):

return values

def diff(self, other, distance="kullback_leibler", num_words=100, n_ann_terms=10, normed=True):
def diff(self, other, distance="kullback_leibler", num_words=100, n_ann_terms=10, diagonal=False, annotation=True, normed=True):
"""
Calculate difference topic2topic between two Lda models
`other` instances of `LdaMulticore` or `LdaModel`
`distance` is function that will be applied to calculate difference between any topic pair.
Available values: `kullback_leibler`, `hellinger` and `jaccard`
`num_words` is quantity of most relevant words that used if distance == `jaccard` (also used for annotation)
`n_ann_terms` is max quantity of words in intersection/symmetric difference between topics (used for annotation)
`diagonal` set to True if the difference is required only between the identical topic no.s (returns diagonal of diff matrix)
`annotation` whether the intersection or difference of words between two topics should be returned
Returns a matrix Z with shape (m1.num_topics, m2.num_topics), where Z[i][j] - difference between topic_i and topic_j
and matrix annotation with shape (m1.num_topics, m2.num_topics, 2, None),
and matrix annotation (if True) with shape (m1.num_topics, m2.num_topics, 2, None),
where:

annotation[i][j] = [[`int_1`, `int_2`, ...], [`diff_1`, `diff_2`, ...]] and
Expand Down Expand Up @@ -1013,35 +1015,49 @@ def diff(self, other, distance="kullback_leibler", num_words=100, n_ann_terms=10
distance_func = distances[distance]
d1, d2 = self.state.get_lambda(), other.state.get_lambda()
t1_size, t2_size = d1.shape[0], d2.shape[0]
annotation_terms = None

fst_topics = [{w for (w, _) in self.show_topic(topic, topn=num_words)} for topic in xrange(t1_size)]
snd_topics = [{w for (w, _) in other.show_topic(topic, topn=num_words)} for topic in xrange(t2_size)]

if distance == "jaccard":
d1, d2 = fst_topics, snd_topics

z = np.zeros((t1_size, t2_size))
for topic1 in range(t1_size):
for topic2 in range(t2_size):
z[topic1][topic2] = distance_func(d1[topic1], d2[topic2])

if normed:
if np.abs(np.max(z)) > 1e-8:
z /= np.max(z)

annotation = [[None] * t1_size for _ in range(t2_size)]
if diagonal:
assert t1_size == t2_size, "Both input models should have same no. of topics, as the diagonal will only be valid in a square matrix"
# initialize z and annotation array
z = np.zeros(t1_size)
if annotation:
annotation_terms = np.zeros(t1_size, dtype=list)
else:
# initialize z and annotation matrix
z = np.zeros((t1_size, t2_size))
if annotation:
annotation_terms = np.zeros((t1_size, t2_size), dtype=list)

# iterate over each cell in the initialized z and annotation
for topic in np.ndindex(z.shape):
topic1 = topic[0]
if diagonal:
topic2 = topic1
else:
topic2 = topic[1]

for topic1 in range(t1_size):
for topic2 in range(t2_size):
z[topic] = distance_func(d1[topic1], d2[topic2])
if annotation:
pos_tokens = fst_topics[topic1] & snd_topics[topic2]
neg_tokens = fst_topics[topic1].symmetric_difference(snd_topics[topic2])

pos_tokens = sample(pos_tokens, min(len(pos_tokens), n_ann_terms))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As I recall, you wanted to remove `sample` in a separate PR — have you already done that in a different PR?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, it's in #1484

neg_tokens = sample(neg_tokens, min(len(neg_tokens), n_ann_terms))

annotation[topic1][topic2] = [pos_tokens, neg_tokens]
annotation_terms[topic] = [pos_tokens, neg_tokens]

if normed:
if np.abs(np.max(z)) > 1e-8:
z /= np.max(z)

return z, annotation
return z, annotation_terms

def __getitem__(self, bow, eps=None):
"""
Expand Down
26 changes: 26 additions & 0 deletions gensim/test/test_tmdiff.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# Copyright (C) 2016 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

import logging
import unittest
import numpy as np

Expand Down Expand Up @@ -31,14 +32,22 @@ def setUp(self):
self.model = LdaModel(corpus=self.corpus, id2word=self.dictionary, num_topics=self.num_topics, passes=10)

def testBasic(self):
# test for matrix case
mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms)

self.assertEqual(mdiff.shape, (self.num_topics, self.num_topics))
self.assertEquals(len(annotation), self.num_topics)
self.assertEquals(len(annotation[0]), self.num_topics)

# test for diagonal case
mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms, diagonal=True)

self.assertEqual(mdiff.shape, (self.num_topics,))
self.assertEquals(len(annotation), self.num_topics)

def testIdentity(self):
for dist_name in ["hellinger", "kullback_leibler", "jaccard"]:
# test for matrix case
mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms, distance=dist_name)

for row in annotation:
Expand All @@ -51,6 +60,23 @@ def testIdentity(self):
if dist_name == "jaccard":
self.assertTrue(np.allclose(mdiff, np.zeros(mdiff.shape, dtype=mdiff.dtype)))

# test for diagonal case
mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms, distance=dist_name, diagonal=True)

for (int_tokens, diff_tokens) in annotation:
self.assertEquals(diff_tokens, [])
self.assertEquals(len(int_tokens), self.n_ann_terms)

self.assertTrue(np.allclose(mdiff, np.zeros(mdiff.shape, dtype=mdiff.dtype)))

if dist_name == "jaccard":
self.assertTrue(np.allclose(mdiff, np.zeros(mdiff.shape, dtype=mdiff.dtype)))

def testInput(self):
self.assertRaises(ValueError, self.model.diff, self.model, n_ann_terms=self.n_ann_terms, distance='something')
self.assertRaises(ValueError, self.model.diff, [], n_ann_terms=self.n_ann_terms, distance='something')


if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
unittest.main()