From 6d980d2b42fe888e0ce361d8f3e33c082f927f24 Mon Sep 17 00:00:00 2001 From: Michal Lopuszynski Date: Tue, 29 May 2018 16:21:06 +0200 Subject: [PATCH 1/5] Fix min_count handling in phrases detection using npmi --- gensim/models/phrases.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 6dc4112a15..29a11aa4d0 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -661,7 +661,7 @@ def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, co len_vocab : int NOT USED. min_count: int - NOT USED. + Take into account only bigrams with count above this value. corpus_word_count : int Number of words in corpus. @@ -671,10 +671,15 @@ def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, co where :math:`prob(word) = \\frac{word\_count}{corpus\_word\_count}` """ - pa = worda_count / corpus_word_count - pb = wordb_count / corpus_word_count - pab = bigram_count / corpus_word_count - return log(pab / (pa * pb)) / -log(pab) + if bigram_count > min_count: + pa = worda_count / corpus_word_count + pb = wordb_count / corpus_word_count + pab = bigram_count / corpus_word_count + return log(pab / (pa * pb)) / -log(pab) + else: + # Return the value below minimal npmi, to make sure that phrases + # will be created only out of bigrams more frequent than min_count + return -1.1 def pseudocorpus(source_vocab, sep, common_terms=frozenset()): From 99dce42b072b5600402088d4fb5bfc72edae4165 Mon Sep 17 00:00:00 2001 From: Michal Lopuszynski Date: Tue, 17 Jul 2018 22:11:19 +0200 Subject: [PATCH 2/5] Refactor min_count handling in npmi phrases detection --- gensim/models/phrases.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 29a11aa4d0..556d88d01b 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -677,9 +677,9 @@ def npmi_scorer(worda_count, 
wordb_count, bigram_count, len_vocab, min_count, co pab = bigram_count / corpus_word_count return log(pab / (pa * pb)) / -log(pab) else: - # Return the value below minimal npmi, to make sure that phrases + # Return -infinity to make sure that phrases # will be created only out of bigrams more frequent than min_count - return -1.1 + return float('-inf') def pseudocorpus(source_vocab, sep, common_terms=frozenset()): From 61cdbfc6366b72bf9acdbe1c85eec6a771a05c49 Mon Sep 17 00:00:00 2001 From: Michal Lopuszynski Date: Mon, 23 Jul 2018 18:33:42 +0200 Subject: [PATCH 3/5] Fix min_count inequality for compatibility with the rest of the gensim API --- gensim/models/phrases.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 556d88d01b..796c54b29c 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -671,7 +671,7 @@ def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, co where :math:`prob(word) = \\frac{word\_count}{corpus\_word\_count}` """ - if bigram_count > min_count: + if bigram_count >= min_count: pa = worda_count / corpus_word_count pb = wordb_count / corpus_word_count pab = bigram_count / corpus_word_count From 61fa8b2a03f47fdb691a29eb601906f9aab01fff Mon Sep 17 00:00:00 2001 From: Michal Lopuszynski Date: Mon, 30 Jul 2018 16:53:48 +0200 Subject: [PATCH 4/5] Fix misleading min_count doc_string --- gensim/models/phrases.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 796c54b29c..a30d59fdbb 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -661,7 +661,7 @@ def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, co len_vocab : int NOT USED. min_count: int - Take into account only bigrams with count above this value. + Ignore all bigrams with total collected count lower than this value. 
corpus_word_count : int Number of words in corpus. From 8123dad336d08f3bc47b987209ce1cc267bdce1e Mon Sep 17 00:00:00 2001 From: Michal Lopuszynski Date: Mon, 30 Jul 2018 17:11:28 +0200 Subject: [PATCH 5/5] Fix misleading min_count comment --- gensim/models/phrases.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index a30d59fdbb..8bc296f886 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -677,8 +677,8 @@ def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, co pab = bigram_count / corpus_word_count return log(pab / (pa * pb)) / -log(pab) else: - # Return -infinity to make sure that phrases - # will be created only out of bigrams more frequent than min_count + # Return -infinity to make sure that no phrases will be created + # from bigrams less frequent than min_count return float('-inf')