From 6d980d2b42fe888e0ce361d8f3e33c082f927f24 Mon Sep 17 00:00:00 2001
From: Michal Lopuszynski <lopusz@users.noreply.github.com>
Date: Tue, 29 May 2018 16:21:06 +0200
Subject: [PATCH 1/5] Fix min_count handling in phrases detection using npmi

---
 gensim/models/phrases.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py
index 6dc4112a15..29a11aa4d0 100644
--- a/gensim/models/phrases.py
+++ b/gensim/models/phrases.py
@@ -661,7 +661,7 @@ def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, co
     len_vocab : int
         NOT USED.
     min_count: int
-        NOT USED.
+        Take into account only bigrams with count above this value.
     corpus_word_count : int
         Number of words in corpus.
 
@@ -671,10 +671,15 @@ def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, co
     where :math:`prob(word) = \\frac{word\_count}{corpus\_word\_count}`
 
     """
-    pa = worda_count / corpus_word_count
-    pb = wordb_count / corpus_word_count
-    pab = bigram_count / corpus_word_count
-    return log(pab / (pa * pb)) / -log(pab)
+    if bigram_count > min_count:
+        pa = worda_count / corpus_word_count
+        pb = wordb_count / corpus_word_count
+        pab = bigram_count / corpus_word_count
+        return log(pab / (pa * pb)) / -log(pab)
+    else:
+        # Return the value below minimal npmi, to make sure that phrases
+        # will be created only out of bigrams more frequent than min_count
+        return -1.1
 
 
 def pseudocorpus(source_vocab, sep, common_terms=frozenset()):

From 99dce42b072b5600402088d4fb5bfc72edae4165 Mon Sep 17 00:00:00 2001
From: Michal Lopuszynski <lopusz@users.noreply.github.com>
Date: Tue, 17 Jul 2018 22:11:19 +0200
Subject: [PATCH 2/5] Refactor min_count handling in npmi phrases detection

---
 gensim/models/phrases.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py
index 29a11aa4d0..556d88d01b 100644
--- a/gensim/models/phrases.py
+++ b/gensim/models/phrases.py
@@ -677,9 +677,9 @@ def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, co
         pab = bigram_count / corpus_word_count
         return log(pab / (pa * pb)) / -log(pab)
     else:
-        # Return the value below minimal npmi, to make sure that phrases
+        # Return -infinity to make sure that phrases
         # will be created only out of bigrams more frequent than min_count
-        return -1.1
+        return float('-inf')
 
 
 def pseudocorpus(source_vocab, sep, common_terms=frozenset()):

From 61cdbfc6366b72bf9acdbe1c85eec6a771a05c49 Mon Sep 17 00:00:00 2001
From: Michal Lopuszynski <lopusz@users.noreply.github.com>
Date: Mon, 23 Jul 2018 18:33:42 +0200
Subject: [PATCH 3/5] Fix min_count inequality for compatibility with the rest
 of the gensim API

---
 gensim/models/phrases.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py
index 556d88d01b..796c54b29c 100644
--- a/gensim/models/phrases.py
+++ b/gensim/models/phrases.py
@@ -671,7 +671,7 @@ def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, co
     where :math:`prob(word) = \\frac{word\_count}{corpus\_word\_count}`
 
     """
-    if bigram_count > min_count:
+    if bigram_count >= min_count:
         pa = worda_count / corpus_word_count
         pb = wordb_count / corpus_word_count
         pab = bigram_count / corpus_word_count

From 61fa8b2a03f47fdb691a29eb601906f9aab01fff Mon Sep 17 00:00:00 2001
From: Michal Lopuszynski <lopusz@users.noreply.github.com>
Date: Mon, 30 Jul 2018 16:53:48 +0200
Subject: [PATCH 4/5] Fix misleading min_count doc_string

---
 gensim/models/phrases.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py
index 796c54b29c..a30d59fdbb 100644
--- a/gensim/models/phrases.py
+++ b/gensim/models/phrases.py
@@ -661,7 +661,7 @@ def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, co
     len_vocab : int
         NOT USED.
     min_count: int
-        Take into account only bigrams with count above this value.
+        Ignore all bigrams with total collected count lower than this value.
     corpus_word_count : int
         Number of words in corpus.
 

From 8123dad336d08f3bc47b987209ce1cc267bdce1e Mon Sep 17 00:00:00 2001
From: Michal Lopuszynski <lopusz@users.noreply.github.com>
Date: Mon, 30 Jul 2018 17:11:28 +0200
Subject: [PATCH 5/5] Fix misleading min_count comment

---
 gensim/models/phrases.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py
index a30d59fdbb..8bc296f886 100644
--- a/gensim/models/phrases.py
+++ b/gensim/models/phrases.py
@@ -677,8 +677,8 @@ def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, co
         pab = bigram_count / corpus_word_count
         return log(pab / (pa * pb)) / -log(pab)
     else:
-        # Return -infinity to make sure that phrases
-        # will be created only out of bigrams more frequent than min_count
+        # Return -infinity to make sure that no phrases will be created
+        # from bigrams less frequent than min_count
         return float('-inf')