Commit 70d79f8 (parent: 3211c6c)

[gensim] Adapted code to handle the HDP model from gensim along with LDA models.
[requirements] Added gensim to test-requirements.
[tests] Added gensim tests to ensure the prepare/save_html functions still work with LDA and HDP models.

6 files changed, 84 additions and 7 deletions.

pyLDAvis/_prepare.py (+1 -1)

@@ -31,7 +31,7 @@


 def __num_dist_rows__(array, ndigits=2):
-    return int(pd.DataFrame(array).sum(axis=1).map(lambda x: round(x, ndigits)).sum())
+    return array.shape[0] - int((pd.DataFrame(array).sum(axis=1) < 0.999).sum())


 class ValidationError(ValueError):
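
The new __num_dist_rows__ no longer rounds each row sum and adds them up; it counts how many rows of the array sum to at least 0.999, which is more tolerant of the small floating-point drift that truncated HDP distributions can show. A minimal sketch of the new behaviour on toy data (illustrative only, not part of the commit):

# Minimal sketch (illustrative only) of the new behaviour: a row counts
# as a valid distribution unless its probabilities sum to less than 0.999.
import numpy as np
import pandas as pd

def num_dist_rows(array):
    return array.shape[0] - int((pd.DataFrame(array).sum(axis=1) < 0.999).sum())

dists = np.array([[0.5, 0.5],       # sums to 1.0   -> counted
                  [0.4999, 0.5],    # sums to ~1.0  -> counted
                  [0.4, 0.4]])      # sums to 0.8   -> not counted
print(num_dist_rows(dists))         # prints 2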

pyLDAvis/gensim.py (+15 -3)

@@ -33,14 +33,26 @@ def _extract_data(topic_model, corpus, dictionary, doc_topic_dists=None):
     assert doc_lengths.shape[0] == len(corpus), 'Document lengths and corpus have different sizes {} != {}'.format(doc_lengths.shape[0], len(corpus))

     if doc_topic_dists is None:
-        gamma, _ = topic_model.inference(corpus)
+        # If it's an HDP model.
+        if hasattr(topic_model, 'lda_beta'):
+            gamma = topic_model.inference(corpus)
+        else:
+            gamma, _ = topic_model.inference(corpus)
         doc_topic_dists = gamma / gamma.sum(axis=1)[:, None]

-    assert doc_topic_dists.shape[1] == topic_model.num_topics, 'Document topics and number of topics do not match {} != {}'.format(doc_topic_dists.shape[0], topic_model.num_topics)
+    if hasattr(topic_model, 'lda_alpha'):
+        num_topics = len(topic_model.lda_alpha)
+    else:
+        num_topics = topic_model.num_topics
+
+    assert doc_topic_dists.shape[1] == num_topics, 'Document topics and number of topics do not match {} != {}'.format(doc_topic_dists.shape[0], num_topics)

     # get the topic-term distribution straight from gensim without
     # iterating over tuples
-    topic = topic_model.state.get_lambda()
+    if hasattr(topic_model, 'lda_beta'):
+        topic = topic_model.lda_beta
+    else:
+        topic = topic_model.state.get_lambda()
     topic = topic / topic.sum(axis=1)[:, None]
     topic_term_dists = topic[:, fnames_argsort]

rtd_reqs.txt (+1)

@@ -2,3 +2,4 @@ jinja2==2.7.2
 numpydoc>=0.4
 pytest
 future
+gensim

setup.py (+2 -1)

@@ -50,7 +50,8 @@ def run_tests(self):

 test_requirements = [
     'pytest',
-    'funcy'
+    'funcy',
+    'gensim'
 ]

 setup(

tests/pyLDAvis/test_gensim_models.py (new file, +61)

@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+
+
+from gensim.models import LdaModel, HdpModel
+from gensim.corpora.dictionary import Dictionary
+import pyLDAvis.gensim
+import os
+
+
+def get_corpus_dictionary():
+    """Crafts a toy corpus and the dictionary associated with it."""
+    # Toy corpus.
+    corpus = [
+        ['carrot', 'salad', 'tomato'],
+        ['carrot', 'salad', 'dish'],
+        ['tomato', 'dish'],
+        ['tomato', 'salad'],
+
+        ['car', 'break', 'highway'],
+        ['highway', 'accident', 'car'],
+        ['moto', 'break'],
+        ['accident', 'moto', 'car']
+    ]
+
+    dictionary = Dictionary(corpus)
+
+    # Transforming the corpus with the dictionary.
+    corpus = [dictionary.doc2bow(doc) for doc in corpus]
+
+    # Building the reverse index.
+    for (token, uid) in dictionary.token2id.items():
+        dictionary.id2token[uid] = token
+
+    return corpus, dictionary
+
+def test_lda():
+    """Trains an LDA model and tests the html outputs."""
+    corpus, dictionary = get_corpus_dictionary()
+
+    lda = LdaModel(corpus=corpus,
+                   num_topics=2)
+
+    data = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
+    pyLDAvis.save_html(data, 'index_lda.html')
+    os.remove('index_lda.html')
+
+
+def test_hdp():
+    """Trains an HDP model and tests the html outputs."""
+    corpus, dictionary = get_corpus_dictionary()
+
+    hdp = HdpModel(corpus, dictionary.id2token)
+
+    data = pyLDAvis.gensim.prepare(hdp, corpus, dictionary)
+    pyLDAvis.save_html(data, 'index_hdp.html')
+    os.remove('index_hdp.html')
+
+
+if __name__ == "__main__":
+    test_lda()
+    test_hdp()
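
An aside on the reverse-index loop in get_corpus_dictionary: gensim's Dictionary appears to populate id2token lazily on first id lookup, so indexing the dictionary once would be an alternative way to fill it. This is an assumption about the Dictionary API, not something the commit relies on:

# Hedged aside (not from the commit): to my knowledge, Dictionary.id2token
# is rebuilt lazily on the first id lookup, which would make this an
# alternative to the explicit reverse-index loop in the test above.
from gensim.corpora.dictionary import Dictionary

dictionary = Dictionary([['carrot', 'salad', 'tomato']])
_ = dictionary[0]            # any lookup populates dictionary.id2token
print(dictionary.id2token)   # e.g. {0: 'carrot', 1: 'salad', 2: 'tomato'}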

tests/pyLDAvis/test_prepare.py (+4 -2)

@@ -74,8 +74,10 @@ def rounded_token_table(r):
         tt.Freq = tt.Freq.round(5)
         return tt
     ett, ott = both(rounded_token_table)
-    joined = pd.merge(ott, ett, on=['Freq', 'Term'], suffixes=['_o','_e'], how='inner')
-    most_likely_map = pd.DataFrame(joined.groupby('Topic_o')['Topic_e'].value_counts(), columns=['count']).query('count > 100')
+    joined = pd.DataFrame(pd.merge(ott, ett, on=['Freq', 'Term'], suffixes=['_o','_e'], how='inner')\
+                          .groupby('Topic_o')['Topic_e'].value_counts())
+    joined.columns = ['count']
+    most_likely_map = joined.query('count > 100')
     most_likely_map.index.names = ['Topic_o', 'Topic_e']
     df = pd.DataFrame(most_likely_map).reset_index()
     assert_array_equal(df['Topic_o'].values, df['Topic_e'].values)
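
The reshuffled lines build the DataFrame from the grouped value_counts() first and rename its single column afterwards; passing columns=['count'] to the constructor can silently drop the data when the underlying Series already carries a different name. A toy sketch of the rename-after-construction pattern (illustrative data only):

# Toy sketch (not from the test) of the rename-after-construction pattern:
# value_counts() returns a named Series, so the column is renamed after
# wrapping it in a DataFrame rather than via the DataFrame constructor.
import pandas as pd

df = pd.DataFrame({'Topic_o': [1, 1, 1, 2], 'Topic_e': [1, 1, 2, 2]})
counts = pd.DataFrame(df.groupby('Topic_o')['Topic_e'].value_counts())
counts.columns = ['count']
print(counts.query('count > 1'))   # keeps only the (Topic_o=1, Topic_e=1) pair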
