From c0dea136acca0c7a5f279454cc12739df90dd9e2 Mon Sep 17 00:00:00 2001 From: parulsethi Date: Sun, 25 Jun 2017 06:06:52 +0530 Subject: [PATCH 01/10] add flags for diagnol and annotation --- gensim/models/ldamodel.py | 59 ++++++++++++++++++++++++++++----------- 1 file changed, 43 insertions(+), 16 deletions(-) diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index 9f41334d47..e6d56e040b 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -971,7 +971,7 @@ def get_term_topics(self, word_id, minimum_probability=None): return values - def diff(self, other, distance="kulback_leibler", num_words=100, n_ann_terms=10, normed=True): + def diff(self, other, distance="kulback_leibler", num_words=100, n_ann_terms=10, normed=True, diagonal=True, matrix=False, annotation=True): """ Calculate difference topic2topic between two Lda models `other` instances of `LdaMulticore` or `LdaModel` @@ -1016,26 +1016,53 @@ def diff(self, other, distance="kulback_leibler", num_words=100, n_ann_terms=10, if distance == "jaccard": d1, d2 = fst_topics, snd_topics - z = np.zeros((t1_size, t2_size)) - for topic1 in range(t1_size): - for topic2 in range(t2_size): - z[topic1][topic2] = distance_func(d1[topic1], d2[topic2]) + if matrix: + z = np.zeros((t1_size, t2_size)) - if normed: - if np.abs(np.max(z)) > 1e-8: - z /= np.max(z) + for topic1 in range(t1_size): + for topic2 in range(t2_size): + z[topic1][topic2] = distance_func(d1[topic1], d2[topic2]) - annotation = [[None] * t1_size for _ in range(t2_size)] + if normed: + if np.abs(np.max(z)) > 1e-8: + z /= np.max(z) - for topic1 in range(t1_size): - for topic2 in range(t2_size): - pos_tokens = fst_topics[topic1] & snd_topics[topic2] - neg_tokens = fst_topics[topic1].symmetric_difference(snd_topics[topic2]) + if annotation: + annotation = [[None] * t1_size for _ in range(t2_size)] - pos_tokens = sample(pos_tokens, min(len(pos_tokens), n_ann_terms)) - neg_tokens = sample(neg_tokens, min(len(neg_tokens), n_ann_terms)) + for topic1 in range(t1_size): + for topic2 in range(t2_size): + pos_tokens = fst_topics[topic1] & snd_topics[topic2] + neg_tokens = fst_topics[topic1].symmetric_difference(snd_topics[topic2]) - annotation[topic1][topic2] = [pos_tokens, neg_tokens] + pos_tokens = sample(pos_tokens, min(len(pos_tokens), n_ann_terms)) + neg_tokens = sample(neg_tokens, min(len(neg_tokens), n_ann_terms)) + print(pos_tokens) + + annotation[topic1][topic2] = [pos_tokens, neg_tokens] + + if diagonal: + assert t1_size == t2_size, 'mismatch between number of topics from both model' + z = np.zeros(t1_size) + + for topic in range(t1_size): + z[topic] = distance_func(d1[topic], d2[topic]) + + if normed: + if np.abs(np.max(z)) > 1e-8: + z /= np.max(z) + + if annotation: + annotation = [None] * t1_size + + for topic in range(t1_size): + pos_tokens = fst_topics[topic] & snd_topics[topic] + neg_tokens = fst_topics[topic].symmetric_difference(snd_topics[topic]) + + pos_tokens = sample(pos_tokens, min(len(pos_tokens), n_ann_terms)) + neg_tokens = sample(neg_tokens, min(len(neg_tokens), n_ann_terms)) + + annotation[topic] = [pos_tokens, neg_tokens] return z, annotation From 22875ef60435fc83c07a47892b4723a3993b142e Mon Sep 17 00:00:00 2001 From: parulsethi Date: Sun, 25 Jun 2017 23:44:06 +0530 Subject: [PATCH 02/10] make matrix default --- gensim/models/ldamodel.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index e6d56e040b..e4481d689d 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -971,7 +971,7 @@ def get_term_topics(self, word_id, minimum_probability=None): return values - def diff(self, other, distance="kulback_leibler", num_words=100, n_ann_terms=10, normed=True, diagonal=True, matrix=False, annotation=True): + def diff(self, other, distance="kulback_leibler", num_words=100, n_ann_terms=10, normed=True, diagonal=False, matrix=True, annotation=True): """ Calculate difference topic2topic between two Lda models `other` instances of `LdaMulticore` or `LdaModel` @@ -1037,12 +1037,11 @@ def diff(self, other, distance="kulback_leibler", num_words=100, n_ann_terms=10, pos_tokens = sample(pos_tokens, min(len(pos_tokens), n_ann_terms)) neg_tokens = sample(neg_tokens, min(len(neg_tokens), n_ann_terms)) - print(pos_tokens) annotation[topic1][topic2] = [pos_tokens, neg_tokens] if diagonal: - assert t1_size == t2_size, 'mismatch between number of topics from both model' + assert t1_size == t2_size, 'mismatch between number of topics in both model' z = np.zeros(t1_size) for topic in range(t1_size): From 68eb54e5b6530ce6123179eb632617b22de00332 Mon Sep 17 00:00:00 2001 From: parulsethi Date: Mon, 10 Jul 2017 04:21:25 +0530 Subject: [PATCH 03/10] remove duplication --- gensim/models/ldamodel.py | 76 +++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 44 deletions(-) diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index e4481d689d..4d393fe16b 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -971,30 +971,34 @@ def get_term_topics(self, word_id, minimum_probability=None): return values - def diff(self, other, distance="kulback_leibler", num_words=100, n_ann_terms=10, normed=True, diagonal=False, matrix=True, annotation=True): + def diff(self, other, distance="kullback_leibler", num_words=100, n_ann_terms=10, normed=True, diagonal=False, annotation=True): """ Calculate difference topic2topic between two Lda models `other` instances of `LdaMulticore` or `LdaModel` `distance` is function that will be applied to calculate difference between any topic pair. - Available values: `kulback_leibler`, `hellinger` and `jaccard` + Available values: `kullback_leibler`, `hellinger` and `jaccard` `num_words` is quantity of most relevant words that used if distance == `jaccard` (also used for annotation) `n_ann_terms` is max quantity of words in intersection/symmetric difference between topics (used for annotation) Returns a matrix Z with shape (m1.num_topics, m2.num_topics), where Z[i][j] - difference between topic_i and topic_j and matrix annotation with shape (m1.num_topics, m2.num_topics, 2, None), - where + where: + annotation[i][j] = [[`int_1`, `int_2`, ...], [`diff_1`, `diff_2`, ...]] and `int_k` is word from intersection of `topic_i` and `topic_j` and `diff_l` is word from symmetric difference of `topic_i` and `topic_j` - `normed` is a flag. If `true`, matrix Z will be normalized + `normed` is a flag. If `true`, matrix Z will be normalized + Example: + >>> m1, m2 = LdaMulticore.load(path_1), LdaMulticore.load(path_2) >>> mdiff, annotation = m1.diff(m2) >>> print(mdiff) # get matrix with difference for each topic pair from `m1` and `m2` >>> print(annotation) # get array with positive/negative words for each topic pair from `m1` and `m2` + """ distances = { - "kulback_leibler": kullback_leibler, + "kullback_leibler": kullback_leibler, "hellinger": hellinger, "jaccard": jaccard_distance, } @@ -1016,54 +1020,38 @@ def diff(self, other, distance="kulback_leibler", num_words=100, n_ann_terms=10, if distance == "jaccard": d1, d2 = fst_topics, snd_topics - if matrix: + if diagonal: + t_size = min(t1_size, t2_size) + z = np.zeros(t_size) + if annotation: + diff_terms = np.zeros(t_size, dtype=list) + else: z = np.zeros((t1_size, t2_size)) - - for topic1 in range(t1_size): - for topic2 in range(t2_size): - z[topic1][topic2] = distance_func(d1[topic1], d2[topic2]) - - if normed: - if np.abs(np.max(z)) > 1e-8: - z /= np.max(z) - if annotation: - annotation = [[None] * t1_size for _ in range(t2_size)] - - for topic1 in range(t1_size): - for topic2 in range(t2_size): - pos_tokens = fst_topics[topic1] & snd_topics[topic2] - neg_tokens = fst_topics[topic1].symmetric_difference(snd_topics[topic2]) - - pos_tokens = sample(pos_tokens, min(len(pos_tokens), n_ann_terms)) - neg_tokens = sample(neg_tokens, min(len(neg_tokens), n_ann_terms)) + diff_terms = np.zeros((t1_size, t2_size), dtype=list) - annotation[topic1][topic2] = [pos_tokens, neg_tokens] + for topic in np.ndindex(z.shape): + topic1 = topic[0] + if diagonal: + topic2 = topic1 + else: + topic2 = topic[1] - if diagonal: - assert t1_size == t2_size, 'mismatch between number of topics in both model' - z = np.zeros(t1_size) + z[topic] = distance_func(d1[topic1], d2[topic2]) + if annotation: + pos_tokens = fst_topics[topic1] & snd_topics[topic2] + neg_tokens = fst_topics[topic1].symmetric_difference(snd_topics[topic2]) - for topic in range(t1_size): - z[topic] = distance_func(d1[topic], d2[topic]) + pos_tokens = sample(pos_tokens, min(len(pos_tokens), n_ann_terms)) + neg_tokens = sample(neg_tokens, min(len(neg_tokens), n_ann_terms)) - if normed: + diff_terms[topic] = [pos_tokens, neg_tokens] + + if normed: if np.abs(np.max(z)) > 1e-8: z /= np.max(z) - if annotation: - annotation = [None] * t1_size - - for topic in range(t1_size): - pos_tokens = fst_topics[topic] & snd_topics[topic] - neg_tokens = fst_topics[topic].symmetric_difference(snd_topics[topic]) - - pos_tokens = sample(pos_tokens, min(len(pos_tokens), n_ann_terms)) - neg_tokens = sample(neg_tokens, min(len(neg_tokens), n_ann_terms)) - - annotation[topic] = [pos_tokens, neg_tokens] - - return z, annotation + return z, diff_terms def __getitem__(self, bow, eps=None): """ From 076ae386fbea523e7502c3ecdaffd12c39338619 Mon Sep 17 00:00:00 2001 From: parulsethi Date: Wed, 12 Jul 2017 20:18:10 +0530 Subject: [PATCH 04/10] raise error on diff no. of topics --- gensim/models/ldamodel.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index 4d393fe16b..48f1111b2a 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -1021,10 +1021,10 @@ def diff(self, other, distance="kullback_leibler", num_words=100, n_ann_terms=10 d1, d2 = fst_topics, snd_topics if diagonal: - t_size = min(t1_size, t2_size) - z = np.zeros(t_size) + assert t1_size == t2_size, "Both input models should have same no. of topics, as the diagonal will only be valid in a square matrix" + z = np.zeros(t1_size) if annotation: - diff_terms = np.zeros(t_size, dtype=list) + diff_terms = np.zeros(t1_size, dtype=list) else: z = np.zeros((t1_size, t2_size)) if annotation: From 4e7f3c784063b89b9d8aa5583577670b064a4bc1 Mon Sep 17 00:00:00 2001 From: parulsethi Date: Wed, 12 Jul 2017 20:35:45 +0530 Subject: [PATCH 05/10] add docstrings --- gensim/models/ldamodel.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index 48f1111b2a..f9f56ca8a0 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -971,7 +971,7 @@ def get_term_topics(self, word_id, minimum_probability=None): return values - def diff(self, other, distance="kullback_leibler", num_words=100, n_ann_terms=10, normed=True, diagonal=False, annotation=True): + def diff(self, other, distance="kullback_leibler", num_words=100, n_ann_terms=10, diagonal=False, annotation=True, normed=True): """ Calculate difference topic2topic between two Lda models `other` instances of `LdaMulticore` or `LdaModel` @@ -979,8 +979,10 @@ def diff(self, other, distance="kullback_leibler", num_words=100, n_ann_terms=10 Available values: `kullback_leibler`, `hellinger` and `jaccard` `num_words` is quantity of most relevant words that used if distance == `jaccard` (also used for annotation) `n_ann_terms` is max quantity of words in intersection/symmetric difference between topics (used for annotation) + `diagonal` set to True if the difference is required only between the identical topic no.s (returns diagonal of diff matrix) + `annotation` whether the intersection or difference of words between two topics should be returned Returns a matrix Z with shape (m1.num_topics, m2.num_topics), where Z[i][j] - difference between topic_i and topic_j - and matrix annotation with shape (m1.num_topics, m2.num_topics, 2, None), + and matrix annotation (if True) with shape (m1.num_topics, m2.num_topics, 2, None), where: annotation[i][j] = [[`int_1`, `int_2`, ...], [`diff_1`, `diff_2`, ...]] and @@ -1022,14 +1024,17 @@ def diff(self, other, distance="kullback_leibler", num_words=100, n_ann_terms=10 if diagonal: assert t1_size == t2_size, "Both input models should have same no. of topics, as the diagonal will only be valid in a square matrix" + # initialize z and annotation array z = np.zeros(t1_size) if annotation: diff_terms = np.zeros(t1_size, dtype=list) else: + # initialize z and annotation matrix z = np.zeros((t1_size, t2_size)) if annotation: diff_terms = np.zeros((t1_size, t2_size), dtype=list) + # iterate over each cell in the initialized z and annotation for topic in np.ndindex(z.shape): topic1 = topic[0] if diagonal: From 10f35f0dd30466386cc9fae15c0ab27dd8bc02c7 Mon Sep 17 00:00:00 2001 From: Parul Sethi Date: Thu, 13 Jul 2017 03:39:03 +0530 Subject: [PATCH 06/10] Fix flake8 --- gensim/models/ldamodel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index f9f56ca8a0..96906e1bb9 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -980,7 +980,7 @@ def diff(self, other, distance="kullback_leibler", num_words=100, n_ann_terms=10 `num_words` is quantity of most relevant words that used if distance == `jaccard` (also used for annotation) `n_ann_terms` is max quantity of words in intersection/symmetric difference between topics (used for annotation) `diagonal` set to True if the difference is required only between the identical topic no.s (returns diagonal of diff matrix) - `annotation` whether the intersection or difference of words between two topics should be returned + `annotation` whether the intersection or difference of words between two topics should be returned Returns a matrix Z with shape (m1.num_topics, m2.num_topics), where Z[i][j] - difference between topic_i and topic_j and matrix annotation (if True) with shape (m1.num_topics, m2.num_topics, 2, None), where: @@ -1051,7 +1051,7 @@ def diff(self, other, distance="kullback_leibler", num_words=100, n_ann_terms=10 neg_tokens = sample(neg_tokens, min(len(neg_tokens), n_ann_terms)) diff_terms[topic] = [pos_tokens, neg_tokens] - + if normed: if np.abs(np.max(z)) > 1e-8: z /= np.max(z) From e52e4fb5cd6c315ef6f6dc438152ad440b6ac465 Mon Sep 17 00:00:00 2001 From: parulsethi Date: Wed, 2 Aug 2017 20:12:09 +0530 Subject: [PATCH 07/10] rename annotation matrix variable --- gensim/models/ldamodel.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index f9f56ca8a0..18ee716981 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -1027,12 +1027,12 @@ def diff(self, other, distance="kullback_leibler", num_words=100, n_ann_terms=10 # initialize z and annotation array z = np.zeros(t1_size) if annotation: - diff_terms = np.zeros(t1_size, dtype=list) + annotation_terms = np.zeros(t1_size, dtype=list) else: # initialize z and annotation matrix z = np.zeros((t1_size, t2_size)) if annotation: - diff_terms = np.zeros((t1_size, t2_size), dtype=list) + annotation_terms = np.zeros((t1_size, t2_size), dtype=list) # iterate over each cell in the initialized z and annotation for topic in np.ndindex(z.shape): @@ -1050,13 +1050,13 @@ def diff(self, other, distance="kullback_leibler", num_words=100, n_ann_terms=10 pos_tokens = sample(pos_tokens, min(len(pos_tokens), n_ann_terms)) neg_tokens = sample(neg_tokens, min(len(neg_tokens), n_ann_terms)) - diff_terms[topic] = [pos_tokens, neg_tokens] + annotation_terms[topic] = [pos_tokens, neg_tokens] if normed: if np.abs(np.max(z)) > 1e-8: z /= np.max(z) - return z, diff_terms + return z, annotation_terms def __getitem__(self, bow, eps=None): """ From 31731ea407695ff9cef2c5e9b432ced942878334 Mon Sep 17 00:00:00 2001 From: parulsethi Date: Thu, 3 Aug 2017 00:11:33 +0530 Subject: [PATCH 08/10] add tests --- gensim/models/ldamodel.py | 1 + gensim/test/test_tmdiff.py | 27 +++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index 18ee716981..fc6553351a 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -1015,6 +1015,7 @@ def diff(self, other, distance="kullback_leibler", num_words=100, n_ann_terms=10 distance_func = distances[distance] d1, d2 = self.state.get_lambda(), other.state.get_lambda() t1_size, t2_size = d1.shape[0], d2.shape[0] + annotation_terms = None fst_topics = [{w for (w, _) in self.show_topic(topic, topn=num_words)} for topic in xrange(t1_size)] snd_topics = [{w for (w, _) in other.show_topic(topic, topn=num_words)} for topic in xrange(t2_size)] diff --git a/gensim/test/test_tmdiff.py b/gensim/test/test_tmdiff.py index 5ab0c0fac7..2632e8861a 100644 --- a/gensim/test/test_tmdiff.py +++ b/gensim/test/test_tmdiff.py @@ -4,6 +4,7 @@ # Copyright (C) 2016 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html +import logging import unittest import numpy as np @@ -31,14 +32,22 @@ def setUp(self): self.model = LdaModel(corpus=self.corpus, id2word=self.dictionary, num_topics=self.num_topics, passes=10) def testBasic(self): + # test for matrix case mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms) self.assertEqual(mdiff.shape, (self.num_topics, self.num_topics)) self.assertEquals(len(annotation), self.num_topics) self.assertEquals(len(annotation[0]), self.num_topics) + # test for diagonal case + mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms, diagonal=True) + + self.assertEqual(mdiff.shape, (self.num_topics,)) + self.assertEquals(len(annotation), self.num_topics) + def testIdentity(self): for dist_name in ["hellinger", "kullback_leibler", "jaccard"]: + # test for matrix case mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms, distance=dist_name) for row in annotation: @@ -51,6 +60,24 @@ def testIdentity(self): if dist_name == "jaccard": self.assertTrue(np.allclose(mdiff, np.zeros(mdiff.shape, dtype=mdiff.dtype))) + # test for diagonal case + mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms, distance=dist_name, diagonal=True) + + for (int_tokens, diff_tokens) in annotation: + self.assertEquals(diff_tokens, []) + self.assertEquals(len(int_tokens), self.n_ann_terms) + + self.assertTrue(np.allclose(mdiff, np.zeros(mdiff.shape, dtype=mdiff.dtype))) + + if dist_name == "jaccard": + self.assertTrue(np.allclose(mdiff, np.zeros(mdiff.shape, dtype=mdiff.dtype))) + def testInput(self): self.assertRaises(ValueError, self.model.diff, self.model, n_ann_terms=self.n_ann_terms, distance='something') self.assertRaises(ValueError, self.model.diff, [], n_ann_terms=self.n_ann_terms, distance='something') + + +if __name__ == '__main__': + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) + unittest.main() + \ No newline at end of file From 4aa837861d1f3f2ec4146eddea422309ac9284bc Mon Sep 17 00:00:00 2001 From: parulsethi Date: Thu, 3 Aug 2017 00:53:48 +0530 Subject: [PATCH 09/10] fix indent --- gensim/models/ldamodel.py | 6 +++--- gensim/test/test_tmdiff.py | 3 +-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index 53745419b0..7b137284cd 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -1052,10 +1052,10 @@ def diff(self, other, distance="kullback_leibler", num_words=100, n_ann_terms=10 neg_tokens = sample(neg_tokens, min(len(neg_tokens), n_ann_terms)) annotation_terms[topic] = [pos_tokens, neg_tokens] - + if normed: - if np.abs(np.max(z)) > 1e-8: - z /= np.max(z) + if np.abs(np.max(z)) > 1e-8: + z /= np.max(z) return z, annotation_terms diff --git a/gensim/test/test_tmdiff.py b/gensim/test/test_tmdiff.py index 2632e8861a..8779096337 100644 --- a/gensim/test/test_tmdiff.py +++ b/gensim/test/test_tmdiff.py @@ -79,5 +79,4 @@ def testInput(self): if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) - unittest.main() - \ No newline at end of file + unittest.main() \ No newline at end of file From 8ef695e4cfa2cd99ae924ef0a4372a072b657c8b Mon Sep 17 00:00:00 2001 From: Parul Sethi Date: Thu, 3 Aug 2017 01:28:07 +0530 Subject: [PATCH 10/10] flake8 fixes --- gensim/test/test_tmdiff.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/test/test_tmdiff.py b/gensim/test/test_tmdiff.py index 8779096337..d6b60e8721 100644 --- a/gensim/test/test_tmdiff.py +++ b/gensim/test/test_tmdiff.py @@ -79,4 +79,4 @@ def testInput(self): if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) - unittest.main() \ No newline at end of file + unittest.main()