Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] HDP #1055

Merged
merged 9 commits into from
Dec 27, 2016
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 71 additions & 3 deletions gensim/models/hdpmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
import scipy.special as sp

from gensim import interfaces, utils, matutils
from gensim.models import basemodel
from gensim.models import basemodel, ldamodel
from six.moves import xrange

logger = logging.getLogger(__name__)
Expand All @@ -56,6 +56,21 @@ def dirichlet_expectation(alpha):
return(sp.psi(alpha) - sp.psi(np.sum(alpha, 1))[:, np.newaxis])


def get_random_state(seed):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why copy-paste from LdaModel? Should it be moved to utils?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll move the common ldamodel and hdpmodel methods to the respective utils and matutils files after this PR is merged.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need to make a few other changes to utils and matutils as well

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be done as a part of this PR. Duplicate code will not be merged.

""" Turn seed into a np.random.RandomState instance.

Method originally from maciejkula/glove-python, and written by @joshloyal
"""
if seed is None or seed is np.random:
return np.random.mtrand._rand
if isinstance(seed, (numbers.Integral, np.integer)):
return np.random.RandomState(seed)
if isinstance(seed, np.random.RandomState):
return seed
raise ValueError('%r cannot be used to seed a np.random.RandomState'
' instance' % seed)


def expect_log_sticks(sticks):
"""
For stick-breaking hdp, return the E[log(sticks)]
Expand Down Expand Up @@ -130,7 +145,7 @@ class HdpModel(interfaces.TransformationABC, basemodel.BaseTopicModel):
def __init__(self, corpus, id2word, max_chunks=None, max_time=None,
chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, alpha=1,
gamma=1, eta=0.01, scale=1.0, var_converge=0.0001,
outputdir=None):
outputdir=None, random_state=None):
"""
`gamma`: first level concentration
`alpha`: second level concentration
Expand All @@ -151,6 +166,8 @@ def __init__(self, corpus, id2word, max_chunks=None, max_time=None,
self.max_time = max_time
self.outputdir = outputdir

self.random_state = get_random_state(random_state)

self.lda_alpha = None
self.lda_beta = None

Expand All @@ -169,7 +186,7 @@ def __init__(self, corpus, id2word, max_chunks=None, max_time=None,
self.m_var_sticks[1] = range(T - 1, 0, -1)
self.m_varphi_ss = np.zeros(T)

self.m_lambda = np.random.gamma(1.0, 1.0, (T, self.m_W)) * self.m_D * 100 / (T * self.m_W) - eta
self.m_lambda = self.random_state.gamma(1.0, 1.0, (T, self.m_W)) * self.m_D * 100 / (T * self.m_W) - eta
self.m_eta = eta
self.m_Elogbeta = dirichlet_expectation(self.m_eta + self.m_lambda)

Expand Down Expand Up @@ -442,6 +459,21 @@ def update_expectations(self):
self.m_timestamp[:] = self.m_updatect
self.m_status_up_to_date = True

def show_topic(self, topic_id, num_words=20, log=False, formatted=False):
"""
Print the `num_words` most probable words for `topics` number of topics.
Set `topics=-1` to print all topics.

Set `formatted=True` to return the topics as a list of strings, or
`False` as lists of (weight, word) pairs.

"""
if not self.m_status_up_to_date:
self.update_expectations()
betas = self.m_lambda + self.m_eta
hdp_formatter = HdpTopicFormatter(self.id2word, betas)
return hdp_formatter.show_topic(topic_id, num_words, log, formatted)

def show_topics(self, num_topics=20, num_words=20, log=False, formatted=True):
"""
Print the `num_words` most probable words for `topics` number of topics.
Expand Down Expand Up @@ -510,6 +542,16 @@ def hdp_to_lda(self):

return (alpha, beta)

def suggested_lda_model(self):
"""
Returns closest corresponding ldamodel object corresponding to current hdp model.
The num_topics is m_T (default is 150) so as to preserve the matrice shapes when we assign alpha and beta.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

how is it different from hdp_to_lda? Add a comment

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed duplicate code, added comment.

"""
alpha, beta = self.hdp_to_lda()
ldam = ldamodel.LdaModel(num_topics=self.m_T, alpha=alpha, id2word=self.id2word, random_state=self.random_state)
ldam.expElogbeta[:] = beta
return ldam

def evaluate_test_corpus(self, corpus):
logger.info('TEST: evaluating test corpus')
if self.lda_alpha is None or self.lda_beta is None:
Expand Down Expand Up @@ -589,6 +631,32 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):

return shown

def print_topic(self, topic_id, num_words):
return self.show_topic(topic_id, num_words, formatted=True)

def show_topic(self, topic_id, num_words, log=False, formatted=False):

lambdak = list(self.data[topic_id, :])
lambdak = lambdak / sum(lambdak)

temp = zip(lambdak, xrange(len(lambdak)))
temp = sorted(temp, key=lambda x: x[0], reverse=True)

topic_terms = self.show_topic_terms(temp, num_words)

if formatted:
topic = self.format_topic(topic_id, topic_terms)

# assuming we only output formatted topics
if log:
logger.info(topic)
else:
topic = (topic_id, topic_terms)

# we only return the topic_terms
return topic[1]


def show_topic_terms(self, topic_data, num_words):
return [(self.dictionary[wid], weight) for (weight, wid) in topic_data[:num_words]]

Expand Down
24 changes: 21 additions & 3 deletions gensim/test/test_hdpmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from gensim import matutils
from gensim.test import basetests

import numpy as np

module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
datapath = lambda fname: os.path.join(module_path, 'test_data', fname)
Expand Down Expand Up @@ -51,12 +52,29 @@ class TestHdpModel(unittest.TestCase, basetests.TestBaseTopicModel):
def setUp(self):
self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
self.class_ = hdpmodel.HdpModel
self.model = self.class_(corpus, id2word=dictionary)
self.model = self.class_(corpus, id2word=dictionary, random_state=np.random.seed(0))

def testShowTopic(self):
# TODO create show_topic in HdpModel and then test
def testTopicValues(self):
"""
Check show topics method
"""
results = self.model.show_topics()[0]
expected_prob, expected_word = '0.264', 'trees '
prob, word = results[1].split('+')[0].split('*')
self.assertEqual(results[0], 0)
self.assertEqual(prob, expected_prob)
self.assertEqual(word, expected_word)

return

def testLDAmodel(self):
"""
Create ldamodel object, and check if the corresponding alphas are equal.
"""
ldam = self.model.suggested_lda_model()
self.assertEqual(ldam.alpha[0], self.model.lda_alpha[0])


if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
unittest.main()