Skip to content

Commit

Permalink
[WIP] Print methods in HDP (piskvorky#1055)
Browse files Browse the repository at this point in the history
* Added print methods, lda_model

* Added HDP tests

* Changelog

* Removed duplicate code

* Removed duplicate code

* Added import

* Fixed Changelog
  • Loading branch information
bhargavvader authored and jayantj committed Jan 4, 2017
1 parent f692046 commit 4f2474d
Show file tree
Hide file tree
Showing 5 changed files with 97 additions and 20 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ Unreleased:

0.13.4, 2016-12-22

* Added suggested lda model method and print methods to HDP class (@bhargavvader, [#1055](https://github.com/RaRe-Technologies/gensim/pull/1055))
* New class KeyedVectors to store embedding separate from training code (@anmol01gulati and @droudy, [#980](https://github.com/RaRe-Technologies/gensim/pull/980))
* Evaluation of word2vec models against semantic similarity datasets like SimLex-999 (@akutuzov, [#1047](https://github.com/RaRe-Technologies/gensim/pull/1047))
* TensorBoard word embedding visualisation of Gensim Word2vec format (@loretoparisi, [#1051](https://github.com/RaRe-Technologies/gensim/pull/1051))
Expand Down
61 changes: 58 additions & 3 deletions gensim/models/hdpmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
import scipy.special as sp

from gensim import interfaces, utils, matutils
from gensim.models import basemodel
from gensim.models import basemodel, ldamodel
from six.moves import xrange

logger = logging.getLogger(__name__)
Expand All @@ -56,6 +56,7 @@ def dirichlet_expectation(alpha):
return(sp.psi(alpha) - sp.psi(np.sum(alpha, 1))[:, np.newaxis])



def expect_log_sticks(sticks):
"""
For stick-breaking hdp, return the E[log(sticks)]
Expand Down Expand Up @@ -130,7 +131,7 @@ class HdpModel(interfaces.TransformationABC, basemodel.BaseTopicModel):
def __init__(self, corpus, id2word, max_chunks=None, max_time=None,
chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, alpha=1,
gamma=1, eta=0.01, scale=1.0, var_converge=0.0001,
outputdir=None):
outputdir=None, random_state=None):
"""
`gamma`: first level concentration
`alpha`: second level concentration
Expand All @@ -151,6 +152,8 @@ def __init__(self, corpus, id2word, max_chunks=None, max_time=None,
self.max_time = max_time
self.outputdir = outputdir

self.random_state = utils.get_random_state(random_state)

self.lda_alpha = None
self.lda_beta = None

Expand All @@ -169,7 +172,7 @@ def __init__(self, corpus, id2word, max_chunks=None, max_time=None,
self.m_var_sticks[1] = range(T - 1, 0, -1)
self.m_varphi_ss = np.zeros(T)

self.m_lambda = np.random.gamma(1.0, 1.0, (T, self.m_W)) * self.m_D * 100 / (T * self.m_W) - eta
self.m_lambda = self.random_state.gamma(1.0, 1.0, (T, self.m_W)) * self.m_D * 100 / (T * self.m_W) - eta
self.m_eta = eta
self.m_Elogbeta = dirichlet_expectation(self.m_eta + self.m_lambda)

Expand Down Expand Up @@ -442,6 +445,21 @@ def update_expectations(self):
self.m_timestamp[:] = self.m_updatect
self.m_status_up_to_date = True

def show_topic(self, topic_id, num_words=20, log=False, formatted=False):
    """
    Return the `num_words` most probable words for the single topic `topic_id`.

    The topic-word weights are taken as `m_lambda + m_eta` and handed to an
    `HdpTopicFormatter`, which performs the ranking and optional formatting.

    `topic_id`: index of the topic to show.
    `num_words`: number of top words to include (default 20).
    `log`: forwarded unchanged to the formatter's `show_topic`.
    `formatted`: forwarded unchanged to the formatter's `show_topic`.
    With `False` the result is a list of (word, weight) pairs; with `True`
    it is the formatter's formatted representation of the topic.
    """
    # Bring the model state up to date first; update_expectations() is what
    # sets m_status_up_to_date back to True.
    if not self.m_status_up_to_date:
        self.update_expectations()
    betas = self.m_lambda + self.m_eta
    hdp_formatter = HdpTopicFormatter(self.id2word, betas)
    return hdp_formatter.show_topic(topic_id, num_words, log, formatted)

def show_topics(self, num_topics=20, num_words=20, log=False, formatted=True):
"""
Print the `num_words` most probable words for `topics` number of topics.
Expand Down Expand Up @@ -510,6 +528,17 @@ def hdp_to_lda(self):

return (alpha, beta)

def suggested_lda_model(self):
    """
    Build the LdaModel that most closely mirrors the current HDP model.

    `hdp_to_lda` only yields the (alpha, beta) pair; this method loads those
    values into an `ldamodel.LdaModel` instance and returns that instance.
    The LDA model is created with `num_topics=self.m_T` (150 unless `T` was
    overridden) so that the matrix shapes line up when alpha and beta are
    assigned.
    """
    lda_alpha, lda_beta = self.hdp_to_lda()
    lda = ldamodel.LdaModel(
        num_topics=self.m_T,
        alpha=lda_alpha,
        id2word=self.id2word,
        random_state=self.random_state,
    )
    # Slice assignment overwrites the existing array in place instead of
    # rebinding the attribute.
    lda.expElogbeta[:] = lda_beta
    return lda

def evaluate_test_corpus(self, corpus):
logger.info('TEST: evaluating test corpus')
if self.lda_alpha is None or self.lda_beta is None:
Expand Down Expand Up @@ -589,6 +618,32 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):

return shown

def print_topic(self, topic_id, num_words):
    """
    Return topic `topic_id` limited to `num_words` words, in formatted form.

    Thin wrapper around `show_topic` with `formatted=True`.
    """
    formatted_topic = self.show_topic(topic_id, num_words, formatted=True)
    return formatted_topic

def show_topic(self, topic_id, num_words, log=False, formatted=False):
    """
    Return the `num_words` highest-weighted words of topic `topic_id`.

    Row `topic_id` of `self.data` is normalised to sum to one, its entries
    are ranked by descending weight, and the top `num_words` are resolved
    through `show_topic_terms`.

    With `formatted=False` the list produced by `show_topic_terms` is
    returned and `log` is ignored. With `formatted=True` the terms are passed
    through `format_topic`, the result is logged at INFO level when `log` is
    set, and element 1 of that result is returned.
    """
    weights = list(self.data[topic_id, :])
    weights = weights / sum(weights)

    # (weight, word id) pairs, heaviest first; sorted() is stable, so equal
    # weights keep their original word-id order.
    ranked = sorted(
        ((weight, wid) for wid, weight in enumerate(weights)),
        key=lambda pair: pair[0],
        reverse=True,
    )
    topic_terms = self.show_topic_terms(ranked, num_words)

    if not formatted:
        return topic_terms

    topic = self.format_topic(topic_id, topic_terms)
    # only formatted topics are ever logged
    if log:
        logger.info(topic)
    # only the second element is handed back, not the topic id
    return topic[1]


def show_topic_terms(self, topic_data, num_words):
    """
    Map the first `num_words` (weight, word id) pairs of `topic_data` to
    (word, weight) pairs, looking each word up in `self.dictionary`.
    """
    leading = topic_data[:num_words]
    return [(self.dictionary[word_id], score) for (score, word_id) in leading]

Expand Down
15 changes: 1 addition & 14 deletions gensim/models/ldamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,19 +92,6 @@ def update_dir_prior(prior, N, logphat, rho):

return prior

def get_random_state(seed):
    """
    Turn `seed` into a `np.random.RandomState` instance.

    Method originally from maciejkula/glove-python, and written by @joshloyal.

    `seed` may be:
      * `None` or the `np.random` module itself -- the module-level
        RandomState (`np.random.mtrand._rand`) is returned;
      * an integer (Python or numpy) -- a new RandomState seeded with it is
        returned;
      * a RandomState instance -- returned unchanged.

    Raises ValueError for any other value.
    """
    if seed is None or seed is np.random:
        return np.random.mtrand._rand
    if isinstance(seed, (numbers.Integral, np.integer)):
        return np.random.RandomState(seed)
    if isinstance(seed, np.random.RandomState):
        return seed
    # Wrap in a 1-tuple: a bare tuple seed would otherwise be unpacked by the
    # % operator and raise TypeError instead of the intended ValueError.
    raise ValueError('%r cannot be used to seed a np.random.RandomState'
                     ' instance' % (seed,))

class LdaState(utils.SaveLoad):
"""
Expand Down Expand Up @@ -314,7 +301,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,

self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')

self.random_state = get_random_state(random_state)
self.random_state = utils.get_random_state(random_state)

assert (self.eta.shape == (self.num_terms,) or self.eta.shape == (self.num_topics, self.num_terms)), (
"Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" %
Expand Down
24 changes: 21 additions & 3 deletions gensim/test/test_hdpmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from gensim import matutils
from gensim.test import basetests

import numpy as np

module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
datapath = lambda fname: os.path.join(module_path, 'test_data', fname)
Expand Down Expand Up @@ -51,12 +52,29 @@ class TestHdpModel(unittest.TestCase, basetests.TestBaseTopicModel):
def setUp(self):
    """Load the test corpus and build the HdpModel used by the tests."""
    self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
    self.class_ = hdpmodel.HdpModel
    # np.random.seed() returns None, so passing its result as `random_state`
    # only ever worked through the side effect of seeding numpy's global
    # generator (which get_random_state(None) hands back). Make that explicit:
    # seed the global generator, then let the model fall back to it.
    np.random.seed(0)
    self.model = self.class_(corpus, id2word=dictionary, random_state=None)

def testShowTopic(self):
# TODO create show_topic in HdpModel and then test
def testTopicValues(self):
"""
Check show topics method
"""
results = self.model.show_topics()[0]
expected_prob, expected_word = '0.264', 'trees '
prob, word = results[1].split('+')[0].split('*')
self.assertEqual(results[0], 0)
self.assertEqual(prob, expected_prob)
self.assertEqual(word, expected_word)

return

def testLDAmodel(self):
"""
Create ldamodel object, and check if the corresponding alphas are equal.
"""
ldam = self.model.suggested_lda_model()
self.assertEqual(ldam.alpha[0], self.model.lda_alpha[0])


if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
unittest.main()
16 changes: 16 additions & 0 deletions gensim/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import subprocess

import numpy
import numbers
import scipy.sparse

if sys.version_info[0] >= 3:
Expand Down Expand Up @@ -80,6 +81,21 @@ def smart_open(fname, mode='rb'):
RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE)


def get_random_state(seed):
    """
    Turn `seed` into a `numpy.random.RandomState` instance.

    Method originally from maciejkula/glove-python, and written by @joshloyal.

    `seed` may be:
      * `None` or the `numpy.random` module itself -- the module-level
        RandomState (`numpy.random.mtrand._rand`) is returned;
      * an integer (Python or numpy) -- a new RandomState seeded with it is
        returned;
      * a RandomState instance -- returned unchanged.

    Raises ValueError for any other value.
    """
    if seed is None or seed is numpy.random:
        return numpy.random.mtrand._rand
    if isinstance(seed, (numbers.Integral, numpy.integer)):
        return numpy.random.RandomState(seed)
    if isinstance(seed, numpy.random.RandomState):
        return seed
    # Wrap in a 1-tuple: a bare tuple seed would otherwise be unpacked by the
    # % operator and raise TypeError instead of the intended ValueError.
    raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
                     ' instance' % (seed,))


def synchronous(tlockname):
"""
A decorator to place an instance-based lock around a method.
Expand Down

0 comments on commit 4f2474d

Please sign in to comment.