From 7a54ead888065354294bf820293a3ee2264cd59b Mon Sep 17 00:00:00 2001 From: ShubhamShaswat Date: Sun, 5 Jul 2020 02:31:52 +0530 Subject: [PATCH 1/3] added idf-smooth --- machine_learning/word_frequency_functions.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index e9e9e644b7d8..361202ffc6ed 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -83,16 +83,17 @@ def document_frequency(term: str, corpus: str) -> int: return (len([doc for doc in docs if term in doc]), len(docs)) -def inverse_document_frequency(df: int, N: int) -> float: +def inverse_document_frequency(df: int, N: int, smoothing=False) -> float: """ Return an integer denoting the importance of a word. This measure of importance is calculated by log10(N/df), where N is the number of documents and df is the Document Frequency. - @params : df, the Document Frequency, and N, - the number of documents in the corpus. - @returns : log10(N/df) + @params : df, the Document Frequency, N, + the number of documents in the corpus and + smoothing, if True return the idf-smooth + @returns : log10(N/df) or 1+log10(N/(1+df)) @examples : >>> inverse_document_frequency(3, 0) Traceback (most recent call last): @@ -104,7 +105,14 @@ def inverse_document_frequency(df: int, N: int) -> float: Traceback (most recent call last): ... 
ZeroDivisionError: df must be > 0 + >>> inverse_document_frequency(0, 3,True) + 1.477 """ + if smoothing == True: + if N == 0: + raise ValueError("log10(0) is undefined.") + return round(1 + log10(N / (1+df)), 3) + if df == 0: raise ZeroDivisionError("df must be > 0") elif N == 0: From 4eb91e9310929bd2b2015be0f649a58b95a5b248 Mon Sep 17 00:00:00 2001 From: ShubhamShaswat Date: Sun, 5 Jul 2020 02:48:08 +0530 Subject: [PATCH 2/3] added idf-smooth --- machine_learning/word_frequency_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index 361202ffc6ed..fd05d9cd20fe 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -108,7 +108,7 @@ def inverse_document_frequency(df: int, N: int, smoothing=False) -> float: >>> inverse_document_frequency(0, 3,True) 1.477 """ - if smoothing == True: + if smoothing: if N == 0: raise ValueError("log10(0) is undefined.") return round(1 + log10(N / (1+df)), 3) From 43bc139bbd71730ea868cc668d7e69d9eb726b2b Mon Sep 17 00:00:00 2001 From: ShubhamShaswat Date: Sun, 5 Jul 2020 02:56:07 +0530 Subject: [PATCH 3/3] added idf-smooth --- machine_learning/word_frequency_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index fd05d9cd20fe..9cf7b694c6be 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -111,7 +111,7 @@ def inverse_document_frequency(df: int, N: int, smoothing=False) -> float: if smoothing: if N == 0: raise ValueError("log10(0) is undefined.") - return round(1 + log10(N / (1+df)), 3) + return round(1 + log10(N / (1 + df)), 3) if df == 0: raise ZeroDivisionError("df must be > 0")