Skip to content

Commit

Permalink
made minor changes to documentation and code in coherencemodel.
Browse files Browse the repository at this point in the history
  • Loading branch information
devashishd12 committed Jun 28, 2016
1 parent ad28cf4 commit 3482910
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 22 deletions.
40 changes: 19 additions & 21 deletions gensim/models/coherencemodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
aggregation)
from gensim.corpora import Dictionary
from gensim.matutils import argsort
from gensim.utils import is_corpus
from gensim.utils import is_corpus, FakeDict
from gensim.models.ldamodel import LdaModel
from gensim.models.wrappers import LdaVowpalWabbit, LdaMallet

Expand All @@ -52,45 +52,43 @@ def __init__(self, model, texts=None, corpus=None, dictionary=None, coherence='c
"""
Args:
----
model : Pre-trained topic model.
model : Pre-trained topic model. Should be provided irrespective of which coherence measure is being used.
texts : Tokenized texts. Needed for coherence models that use sliding window based probability estimator.
corpus : Gensim document corpus.
dictionary : Gensim dictionary mapping of id word to create corpus.
dictionary : Gensim dictionary mapping between word ids and words, used to create the corpus. If model.id2word is present, this is not needed.
If both are provided, dictionary will be used.
coherence : Coherence measure to be used. Supported values are:
u_mass
c_v
'u_mass'
'c_v'
For 'u_mass', corpus should be provided. If texts is provided instead, it will be converted to a corpus using the dictionary.
For 'c_v', texts should be provided. Corpus is not needed.
"""
if texts is None and corpus is None:
raise ValueError("One of texts or corpus has to be provided.")
# Check if associated dictionary is provided.
if dictionary is None:
if isinstance(model.id2word, FakeDict):
raise ValueError("The associated dictionary should be provided with the corpus or 'id2word' for topic model"
" should be set as the associated dictionary.")
else:
self.dictionary = model.id2word
else:
self.dictionary = dictionary
# Check for correct inputs for u_mass coherence measure.
if coherence == 'u_mass':
if is_corpus(corpus)[0]:
if dictionary is None:
if model.id2word[0] == 0:
raise ValueError("The associated dictionary should be provided with the corpus or 'id2word' for topic model"
"should be set as the dictionary.")
else:
self.dictionary = model.id2word
else:
self.dictionary = dictionary
self.corpus = corpus
elif texts is not None:
self.texts = texts
if dictionary is None:
self.dictionary = Dictionary(self.texts)
else:
self.dictionary = dictionary
self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
else:
raise ValueError("Either 'corpus' with 'dictionary' or 'texts' should be provided for %s coherence." % coherence)

# Check for correct inputs for c_v coherence measure.
elif coherence == 'c_v':
if texts is None:
raise ValueError("'texts' should be provided for %s coherence." % coherence)
else:
self.texts = texts
self.dictionary = Dictionary(self.texts)
self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]

else:
raise ValueError("%s coherence is not currently supported." % coherence)

Expand Down
1 change: 1 addition & 0 deletions gensim/topic_coherence/direct_confirmation_measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ def log_conditional_probability(segmented_topics, per_topic_postings, num_docs):

def log_ratio_measure(segmented_topics, per_topic_postings, num_docs):
"""
Popularly known as PMI.
This function calculates the log-ratio-measure which is used by
coherence measures such as c_v.
This is defined as: m_lr(S_i) = log[(P(W', W*) + e) / (P(W') * P(W*))]
Expand Down
6 changes: 5 additions & 1 deletion gensim/topic_coherence/indirect_confirmation_measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,14 @@ def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gam
----
topics : Topics obtained from the trained topic model.
segmented_topics : Output from the segmentation module of the segmented topics. Is a list of lists of tuples.
per_topic_postings : per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics.
per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics.
measure : String. Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio).
gamma : Gamma value for computing W', W* vectors.
num_docs : Total number of documents in corresponding corpus.
Returns:
-------
s_cos_sim : array of cosine similarity of the context vectors for each segmentation
"""
if measure == 'nlr':
measure = direct_confirmation_measure.normalized_log_ratio_measure
Expand Down

0 comments on commit 3482910

Please sign in to comment.