made minor changes to documentation and code in coherencemodel.

piskvorky · Jun 28, 2016 · 3482910
1 parent ad28cf4
commit 3482910
Show file tree

Hide file tree

Showing 3 changed files with 25 additions and 22 deletions.
diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py
@@ -26,7 +26,7 @@
                                     aggregation)
 from gensim.corpora import Dictionary
 from gensim.matutils import argsort
-from gensim.utils import is_corpus
+from gensim.utils import is_corpus, FakeDict
 from gensim.models.ldamodel import LdaModel
 from gensim.models.wrappers import LdaVowpalWabbit, LdaMallet
 
@@ -52,45 +52,43 @@ def __init__(self, model, texts=None, corpus=None, dictionary=None, coherence='c
         """
         Args:
         ----
-        model : Pre-trained topic model.
+        model : Pre-trained topic model. Should be provided irrespective of which coherence measure is being used.
         texts : Tokenized texts. Needed for coherence models that use sliding window based probability estimator.
         corpus : Gensim document corpus.
-        dictionary : Gensim dictionary mapping of id word to create corpus.
+        dictionary : Gensim dictionary mapping of id word to create corpus. If model.id2word is present, this is not needed.
+                     If both are provided, dictionary will be used.
         coherence : Coherence measure to be used. Supported values are:
-                    u_mass
-                    c_v
+                    'u_mass'
+                    'c_v'
+                    For 'u_mass' corpus should be provided. If texts is provided, it will be converted to corpus using the dictionary.
+                    For 'c_v' texts should be provided. Corpus is not needed.
         """
         if texts is None and corpus is None:
             raise ValueError("One of texts or corpus has to be provided.")
+        # Check if associated dictionary is provided.
+        if dictionary is None:
+            if isinstance(model.id2word, FakeDict):
+                raise ValueError("The associated dictionary should be provided with the corpus or 'id2word' for topic model"
+                                 " should be set as the associated dictionary.")
+            else:
+                self.dictionary = model.id2word
+        else:
+            self.dictionary = dictionary
+        # Check for correct inputs for u_mass coherence measure.
         if coherence == 'u_mass':
             if is_corpus(corpus)[0]:
-                if dictionary is None:
-                    if model.id2word[0] == 0:
-                        raise ValueError("The associated dictionary should be provided with the corpus or 'id2word' for topic model"
-                                         "should be set as the dictionary.")
-                    else:
-                        self.dictionary = model.id2word
-                else:
-                    self.dictionary = dictionary
                 self.corpus = corpus
             elif texts is not None:
                 self.texts = texts
-                if dictionary is None:
-                    self.dictionary = Dictionary(self.texts)
-                else:
-                    self.dictionary = dictionary
                 self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
             else:
                 raise ValueError("Either 'corpus' with 'dictionary' or 'texts' should be provided for %s coherence." % coherence)
-
+        # Check for correct inputs for c_v coherence measure.
         elif coherence == 'c_v':
             if texts is None:
                 raise ValueError("'texts' should be provided for %s coherence." % coherence)
             else:
                 self.texts = texts
-                self.dictionary = Dictionary(self.texts)
-                self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
-
         else:
             raise ValueError("%s coherence is not currently supported." % coherence)
 

diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py
@@ -44,6 +44,7 @@ def log_conditional_probability(segmented_topics, per_topic_postings, num_docs):
 
 def log_ratio_measure(segmented_topics, per_topic_postings, num_docs):
     """
+    Popularly known as PMI.
     This function calculates the log-ratio-measure which is used by
     coherence measures such as c_v.
     This is defined as: m_lr(S_i) = log[(P(W', W*) + e) / (P(W') * P(W*))]

diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py
@@ -59,10 +59,14 @@ def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gam
     ----
     topics : Topics obtained from the trained topic model.
     segmented_topics : segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples.
-    per_topic_postings : per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics.
+    per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics.
     measure : String. Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio).
     gamma : Gamma value for computing W', W* vectors.
     num_docs : Total number of documents in corresponding corpus.
+
+    Returns:
+    -------
+    s_cos_sim : array of cosine similarity of the context vectors for each segmentation
     """
     if measure == 'nlr':
         measure = direct_confirmation_measure.normalized_log_ratio_measure