Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Easy pyLDAvis visualisation and coherence for DTM python, wrapper. #829

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 45 additions & 1 deletion gensim/models/ldaseqmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
3) Heavy lifting going on in the sslm class - efforts can be made to cythonise mathematical methods.
- in particular, update_obs and the optimization take a lot of time.
4) Try and make it distributed, especially around the E and M step.
5) Remove all C/C++ coding style/syntax.

"""

Expand Down Expand Up @@ -290,6 +291,7 @@ def inferDTMseq(self, seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost,

time_slice = numpy.cumsum(numpy.array(self.time_slice))

# TODO: use chunks similar to ldamodel for constant memory footprint.
for line_no, line in enumerate(seq_corpus.corpus):
# this is used to update the time_slice and create a new lda_seq slice every new time_slice
if doc_index > time_slice[time]:
Expand Down Expand Up @@ -379,7 +381,7 @@ def print_topic(self, topic, time=0, top_terms=20):
topic = numpy.exp(topic[time])
topic = topic / topic.sum()
bestn = matutils.argsort(topic, top_terms, reverse=True)
beststr = [(round(topic[id_], 3), self.corpus.id2word[id_]) for id_ in bestn]
beststr = [(self.corpus.id2word[id_], round(topic[id_], 3)) for id_ in bestn]
return beststr


Expand All @@ -392,6 +394,48 @@ def doc_topics(self, doc_number):
doc_topic /= doc_topic.sum(axis=1)[:, numpy.newaxis]
return doc_topic[doc_number]

def DTMvis(self, time):
    """
    Collect the inputs needed to visualise one time-slice of the DTM via pyLDAvis.

    Parameters
    ----------
    time : int
        Index of the time-slice to visualise.

    Returns
    -------
    doc_topic : numpy.ndarray
        Copy of `self.gammas` with every row normalised to sum to 1.
    topic_term : numpy.ndarray
        One row per entry of `self.topic_chains`: that chain's `e_log_prob` at `time`,
        exponentiated and normalised to sum to 1.
    doc_lengths : list of int
        `len(doc)` for every document in `self.corpus.corpus`.
    term_frequency : list
        For every term id below `self.vocab_len`, the sum of its counts over the corpus.

    These four values should be passed to the `pyLDAvis.prepare` method.
    """
    # work on a copy so that the model's own gammas are not normalised in place.
    doc_topic = numpy.copy(self.gammas)
    doc_topic /= doc_topic.sum(axis=1)[:, numpy.newaxis]

    # iterate over the chains themselves; enumerate() would hand back (index, chain)
    # tuples, which have no `e_log_prob` attribute.
    topic_term = []
    for chain in self.topic_chains:
        topic = numpy.exp(numpy.transpose(chain.e_log_prob)[time])
        topic_term.append(topic / topic.sum())

    term_frequency = [0] * self.vocab_len
    doc_lengths = []
    for doc in self.corpus.corpus:
        # NOTE(review): len(doc) counts the (term_id, count) entries, not the tokens;
        # pyLDAvis describes doc_lengths as word counts - confirm which is intended.
        doc_lengths.append(len(doc))
        for term_id, count in doc:
            term_frequency[term_id] += count

    return doc_topic, numpy.array(topic_term), doc_lengths, term_frequency

def DTMcoherence(self, time):
    """
    Return the topics of one time-slice as lists of words, without probability values,
    for use with either "u_mass" or "c_v" coherence.

    Parameters
    ----------
    time : int
        Time-slice passed on to `self.print_topics`.

    Returns
    -------
    list of list
        One inner list per topic returned by `self.print_topics(time)`, holding the first
        element (the word) of each two-element entry of that topic.
    """
    # each entry is expected to be a (word, value) pair; only the word is kept.
    return [[word for word, _ in topic] for topic in self.print_topics(time)]

def __getitem__(self, doc):
"""
Expand Down
41 changes: 41 additions & 0 deletions gensim/models/wrappers/dtmmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,3 +303,44 @@ def show_topic(self, topicid, time, num_words=50):
def print_topic(self, topicid, time, num_words=10):
    """Return the given topic at the given time, formatted as a single string.

    Every entry returned by `self.show_topic` is rendered with '%.3f*%s' and the
    rendered pieces are joined with ' + '.
    """
    entries = self.show_topic(topicid, time, num_words)
    pieces = []
    for entry in entries:
        pieces.append('%.3f*%s' % entry)
    return ' + '.join(pieces)

def DTMvis(self, corpus, time):
    """
    Collect the inputs needed to visualise one time-slice of the DTM via pyLDAvis.

    Parameters
    ----------
    corpus : iterable of list of (int, number)
        Documents as lists of (term_id, count) pairs.
    time : int
        Index of the time-slice to visualise (last axis of `self.lambda_`).

    Returns
    -------
    doc_topic : object
        `self.gamma_`, returned as is.
    topic_term : numpy.ndarray
        `exp(self.lambda_[:, :, time])` with every row normalised to sum to 1.
    doc_lengths : list of int
        `len(doc)` for every document in `corpus`.
    term_frequency : list
        For every term id below `self.num_terms`, the sum of its counts over `corpus`.

    These four values should be passed to the `pyLDAvis.prepare` method.
    """
    # read the parameters from this model instance (`dtm_model` was never defined here).
    # NOTE(review): assumes lambda_ is indexed as (topic, term, time-slice) - confirm.
    topic_term = numpy.exp(self.lambda_[:, :, time])
    # normalise each row separately so that every topic is a proper distribution;
    # dividing by the global sum and scaling by num_topics only achieves that when
    # all rows happen to carry the same total mass.
    topic_term = topic_term / topic_term.sum(axis=1)[:, numpy.newaxis]

    doc_topic = self.gamma_

    term_frequency = [0] * self.num_terms
    doc_lengths = []
    # iterate over the documents themselves; enumerate() would yield (index, doc) tuples.
    for doc in corpus:
        # NOTE(review): len(doc) counts the (term_id, count) entries, not the tokens;
        # pyLDAvis describes doc_lengths as word counts - confirm which is intended.
        doc_lengths.append(len(doc))
        for term_id, count in doc:
            term_frequency[term_id] += count

    return doc_topic, topic_term, doc_lengths, term_frequency

def DTMcoherence(self, time):
    """
    Return topics as lists of words, without probability values, for use with either
    "u_mass" or "c_v" coherence.

    Parameters
    ----------
    time : int
        Passed to `self.show_topics` as its `times` argument.

    Returns
    -------
    list of list
        One inner list per topic returned by `self.show_topics(times=time, formatted=False)`,
        holding the second element (the word) of each (prob, word) entry.

    TODO: because of the print format, right now this can only return topics for the 1st time-slice.
    Should we fix the coherence printing or make changes to the print statements to mirror DTM python?
    """
    # iterate over the topics themselves; enumerate() would yield (index, topic) tuples
    # and the (prob, word) unpacking below would then fail on the integer index.
    topics = self.show_topics(times=time, formatted=False)
    return [[word for _, word in topic] for topic in topics]