Feature/candidates return none (#121)
* return None when no candidate is present; remove deprecated function; update minimum frequency in default languages to 50

* update changelog

* update tests based on new minimum frequency
barrust authored May 28, 2022
1 parent 02154d7 commit 5b9c8d5
Showing 12 changed files with 113 additions and 131 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
# pyspellchecker

+## Version 0.7.0
+* Backwards Compatibility Change:
+  * `spell.candidates` and `spell.correction` now return `None` if there are no valid corrections or candidates
+* Remove misspelled words from [issue #120](https://github.com/barrust/pyspellchecker/issues/120)
+* Update all default language dictionaries after updating the minimum frequency to 50 in `scripts/build_dictionary.py`
+
## Version 0.6.3
* Added class method to be able to get a listing of all supported languages
* Added type hinting
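A minimal sketch of the new return-value behavior described in the changelog entry above (the example words are illustrative; the gibberish token is assumed to have no dictionary entry within an edit distance of two):

```python
from spellchecker import SpellChecker

spell = SpellChecker()  # loads the bundled English dictionary by default

# A word with a close match still returns a correction
print(spell.correction("speling"))     # -> "spelling"

# As of 0.7.0, a word with no plausible candidates returns None
# instead of echoing the input word back
print(spell.candidates("qwrtypsdfg"))  # -> None (assumed: no known word within edit distance two)
print(spell.correction("qwrtypsdfg"))  # -> None
```

Previously `candidates` fell back to `{word}` and `correction` to the word itself, so callers could not distinguish "already correct" from "no suggestion available".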
156 changes: 84 additions & 72 deletions scripts/build_dictionary.py
@@ -24,24 +24,22 @@
import string
from collections import Counter

-from nltk import data


STRING_PUNCTUATION = tuple(string.punctuation)
DIGETS = tuple(string.digits)
-MINIMUM_FREQUENCY = 15
+MINIMUM_FREQUENCY = 50


@contextlib.contextmanager
def load_file(filename, encoding="utf-8"):
""" Context manager to handle opening a gzip or text file correctly and
reading all the data
Args:
filename (str): The filename to open
encoding (str): The file encoding to use
Yields:
str: The string data from the file read
"""Context manager to handle opening a gzip or text file correctly and
reading all the data
Args:
filename (str): The filename to open
encoding (str): The file encoding to use
Yields:
str: The string data from the file read
"""
if filename[-3:].lower() == ".gz":
with gzip.open(filename, mode="rt", encoding=encoding) as fobj:
@@ -52,29 +50,29 @@ def load_file(filename, encoding="utf-8"):


def export_word_frequency(filepath, word_frequency):
""" Export a word frequency as a json object
"""Export a word frequency as a json object
Args:
filepath (str):
word_frequency (Counter):
Args:
filepath (str):
word_frequency (Counter):
"""
with open(filepath, 'w') as f:
with open(filepath, "w") as f:
json.dump(word_frequency, f, indent="", sort_keys=True, ensure_ascii=False)


def build_word_frequency(filepath, language, output_path):
""" Parse the passed in text file (likely from Open Subtitles) into
a word frequency list and write it out to disk
Args:
filepath (str):
language (str):
output_path (str):
Returns:
Counter: The word frequency as parsed from the file
Note:
This only removes words that are proper nouns (attempts to...) and
anything that starts or stops with something that is not in the alphabet.
"""Parse the passed in text file (likely from Open Subtitles) into
a word frequency list and write it out to disk
Args:
filepath (str):
language (str):
output_path (str):
Returns:
Counter: The word frequency as parsed from the file
Note:
This only removes words that are proper nouns (attempts to...) and
anything that starts or stops with something that is not in the alphabet.
"""
# NLTK is only needed in this portion of the project
try:
@@ -91,15 +89,19 @@ def build_word_frequency(filepath, language, output_path):
tok = WhitespaceTokenizer()

idx = 0
-    with load_file(filepath, 'utf-8') as fobj:
+    with load_file(filepath, "utf-8") as fobj:
for line in fobj:
# tokenize into parts
parts = tok.tokenize(line)

# Attempt to remove proper nouns
# Remove things that have leading or trailing non-alphabetic characters.
tagged_sent = pos_tag(parts)
-            words = [word[0].lower() for word in tagged_sent if word[0] and not word[1] == "NNP" and word[0][0].isalpha() and word[0][-1].isalpha()]
+            words = [
+                word[0].lower()
+                for word in tagged_sent
+                if word[0] and not word[1] == "NNP" and word[0][0].isalpha() and word[0][-1].isalpha()
+            ]

# print(words)
if words:
@@ -117,7 +119,7 @@ def build_word_frequency(filepath, language, output_path):


def export_misfit_words(misfit_filepath, word_freq_filepath, word_frequency):
-    with load_file(word_freq_filepath, 'utf-8') as f:
+    with load_file(word_freq_filepath, "utf-8") as f:
source_word_frequency = json.load(f)

source_words = set(source_word_frequency.keys())
@@ -126,19 +128,19 @@ def export_misfit_words(misfit_filepath, word_freq_filepath, word_frequency):
misfitted_words = source_words.difference(final_words)
misfitted_words = sorted(list(misfitted_words))

-    with open(misfit_filepath, 'w+') as file:
+    with open(misfit_filepath, "w+") as file:
for word in misfitted_words:
file.write(word)
-            file.write('\n')
+            file.write("\n")


def clean_english(word_frequency, filepath_exclude, filepath_include):
""" Clean an English word frequency list
"""Clean an English word frequency list
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
"""
letters = set("abcdefghijklmnopqrstuvwxyz'")

@@ -164,7 +166,7 @@ def clean_english(word_frequency, filepath_exclude, filepath_include):
# Remove double punctuations (a-a-a-able) or (a'whoppinganda'whumping)
double_punc = list()
for key in word_frequency:
if key.count("'") > 1 or key.count(".") > 2:
if key.count("'") > 1 or key.count("-") > 1 or key.count(".") > 2:
double_punc.append(key)
for misfit in double_punc:
word_frequency.pop(misfit)
@@ -248,12 +250,12 @@ def clean_english(word_frequency, filepath_exclude, filepath_include):


def clean_spanish(word_frequency, filepath_exclude, filepath_include):
""" Clean a Spanish word frequency list
"""Clean a Spanish word frequency list
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
"""
letters = set("abcdefghijklmnopqrstuvwxyzáéíóúüñ")

@@ -341,12 +343,12 @@ def clean_spanish(word_frequency, filepath_exclude, filepath_include):


def clean_german(word_frequency, filepath_exclude, filepath_include):
""" Clean a German word frequency list
"""Clean a German word frequency list
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
"""
letters = set("abcdefghijklmnopqrstuvwxyzäöüß")

@@ -398,12 +400,12 @@ def clean_german(word_frequency, filepath_exclude, filepath_include):


def clean_french(word_frequency, filepath_exclude, filepath_include):
""" Clean a French word frequency list
"""Clean a French word frequency list
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
"""
letters = set("abcdefghijklmnopqrstuvwxyzéàèùâêîôûëïüÿçœæ")

@@ -455,12 +457,12 @@ def clean_french(word_frequency, filepath_exclude, filepath_include):


def clean_portuguese(word_frequency, filepath_exclude, filepath_include):
""" Clean a Portuguese word frequency list
"""Clean a Portuguese word frequency list
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
"""
letters = set("abcdefghijklmnopqrstuvwxyzáâãàçéêíóôõú")

@@ -512,12 +514,12 @@ def clean_portuguese(word_frequency, filepath_exclude, filepath_include):


def clean_russian(word_frequency, filepath_exclude, filepath_include):
""" Clean an Russian word frequency list
"""Clean an Russian word frequency list
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
"""
letters = set("абвгдеёжзийклмнопрстуфхцчшщъыьэюя")

@@ -591,11 +593,21 @@ def _parse_args():
"""parse arguments for command-line usage"""
import argparse

-    parser = argparse.ArgumentParser(description='Build a new dictionary (word frequency) using the OpenSubtitles2018 project')
-    parser.add_argument("-l", "--language", required=True, help="The language being built", choices=['en', 'es', 'de', 'fr', 'pt', 'ru'])
-    parser.add_argument("-f", "--file-path", help="The path to the downloaded text file OR the saved word frequency json")
-    parser.add_argument("-p", "--parse-input", action="store_true", help="Add this if providing a text file to be parsed")
-    parser.add_argument("-m", "--misfit-file", action="store_true", help="Create file with words which were removed from dictionary")
+    parser = argparse.ArgumentParser(
+        description="Build a new dictionary (word frequency) using the OpenSubtitles2018 project"
+    )
+    parser.add_argument(
+        "-l", "--language", required=True, help="The language being built", choices=["en", "es", "de", "fr", "pt", "ru"]
+    )
+    parser.add_argument(
+        "-f", "--file-path", help="The path to the downloaded text file OR the saved word frequency json"
+    )
+    parser.add_argument(
+        "-p", "--parse-input", action="store_true", help="Add this if providing a text file to be parsed"
+    )
+    parser.add_argument(
+        "-m", "--misfit-file", action="store_true", help="Create file with words which were removed from dictionary"
+    )

args = parser.parse_args()

@@ -613,7 +625,7 @@ def _parse_args():
return args


-if __name__ == '__main__':
+if __name__ == "__main__":
args = _parse_args()

# get current path to find where the script is currently
@@ -638,12 +650,12 @@ def _parse_args():
else:
json_path = os.path.join(script_path, "data", "{}_full.json.gz".format(args.language))
print(json_path)
-        with load_file(json_path, 'utf-8') as f:
+        with load_file(json_path, "utf-8") as f:
word_frequency = json.load(f)

# create include and exclude files before cleaning
for filepath in (include_filepath, exclude_filepath):
-        with open(filepath, 'a+'):
+        with open(filepath, "a+"):
pass

# clean up the dictionary
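The raised `MINIMUM_FREQUENCY` acts as a floor on the parsed word counts; a minimal sketch of the kind of cutoff the constant drives (illustrative only, not the script's exact cleaning pipeline):

```python
from collections import Counter

MINIMUM_FREQUENCY = 50  # raised from 15 in this commit

def apply_frequency_floor(word_frequency: Counter) -> Counter:
    """Drop words seen fewer than MINIMUM_FREQUENCY times."""
    return Counter({word: count for word, count in word_frequency.items() if count >= MINIMUM_FREQUENCY})

counts = Counter({"the": 12000, "hello": 870, "helo": 49})
print(apply_frequency_floor(counts))  # Counter({'the': 12000, 'hello': 870})
```

Invocation follows the argparse block above, e.g. `python scripts/build_dictionary.py -l en -f <subtitles text file> -p` to parse a raw text file, with `-m` to also write the misfit-word file.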
3 changes: 3 additions & 0 deletions scripts/data/en_exclude.txt
@@ -152,3 +152,6 @@ suficiente
scientifiic
prophecied
lucien's
+adress
+helo
+abcs
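The three added entries are deliberate misspellings being dropped from the English dictionary (see issue #120 in the changelog). A minimal sketch of how an exclude file of this shape typically feeds the cleaning step, inferred from the `clean_english(word_frequency, filepath_exclude, filepath_include)` signature (`apply_exclude_list` is a hypothetical name, not from this repository):

```python
from collections import Counter

def apply_exclude_list(word_frequency: Counter, filepath_exclude: str) -> None:
    """Remove every word listed in the exclude file (one word per line) from the frequency map."""
    with open(filepath_exclude, encoding="utf-8") as fobj:
        for line in fobj:
            word = line.strip().lower()
            if word:
                word_frequency.pop(word, None)  # drop silently if the word is absent
```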
2 changes: 1 addition & 1 deletion spellchecker/info.py
@@ -5,7 +5,7 @@
__maintainer__ = "Tyler Barrus"
__email__ = "barrust@gmail.com"
__license__ = "MIT"
__version__ = "0.6.3"
__version__ = "0.7.0"
__credits__ = ["Peter Norvig"]
__url__ = "https://github.com/barrust/pyspellchecker"
__bugtrack_url__ = "{0}/issues".format(__url__)
Binary file modified spellchecker/resources/de.json.gz
Binary file modified spellchecker/resources/en.json.gz
Binary file modified spellchecker/resources/es.json.gz
Binary file modified spellchecker/resources/fr.json.gz
Binary file modified spellchecker/resources/pt.json.gz
Binary file modified spellchecker/resources/ru.json.gz
42 changes: 10 additions & 32 deletions spellchecker/spellchecker.py
Expand Up @@ -8,14 +8,7 @@
from collections import Counter
from collections.abc import Iterable

-from .utils import (
-    KeyT,
-    _parse_into_words,
-    deprecated,
-    ensure_unicode,
-    load_file,
-    write_file,
-)
+from .utils import KeyT, _parse_into_words, ensure_unicode, load_file, write_file


class SpellChecker(object):
@@ -155,42 +148,27 @@ def word_usage_frequency(self, word: KeyT, total_words: typing.Optional[int] = N
word = ensure_unicode(word)
return self._word_frequency.dictionary[word] / total_words

@deprecated("Deprecated as of version 0.6.5; use word_usage_frequency instead")
def word_probability(self, word: KeyT, total_words: typing.Optional[int] = None) -> float:
"""Calculate the frequency to the `word` provided as seen across the
entire dictionary; function was a misnomar and is therefore
deprecated!
Args:
word (str): The word for which the word probability is calculated
total_words (int): The total number of words to use in thecalculation; use the default for using the whole word frequency
Returns:
float: The probability that the word is the correct word
Note:
Deprecated as of version 0.6.1; use `word_usage_frequency` instead
Note:
Will be removed in version 0.6.4"""
return self.word_usage_frequency(word, total_words)

-    def correction(self, word: KeyT) -> str:
+    def correction(self, word: KeyT) -> typing.Optional[str]:
        """The most probable correct spelling for the word
        Args:
            word (str): The word to correct
        Returns:
-            str: The most likely candidate"""
+            str: The most likely candidate or None if no correction is present"""
        word = ensure_unicode(word)
-        candidates = list(self.candidates(word))
-        return max(sorted(candidates), key=self.__getitem__)
+        candidates = self.candidates(word)
+        if not candidates:
+            return None
+        return max(sorted(list(candidates)), key=self.__getitem__)

-    def candidates(self, word: KeyT) -> typing.Set[str]:
+    def candidates(self, word: KeyT) -> typing.Optional[typing.Set[str]]:
        """Generate possible spelling corrections for the provided word up to
        an edit distance of two, if and only when needed
        Args:
            word (str): The word for which to calculate candidate spellings
        Returns:
-            set: The set of words that are possible candidates"""
+            set: The set of words that are possible candidates or None if there are no candidates"""
        word = ensure_unicode(word)
        if self.known([word]):  # short-cut if word is correct already
            return {word}
@@ -208,7 +186,7 @@ def candidates(self, word: KeyT) -> typing.Set[str]:
        tmp = self.known([x for x in self.__edit_distance_alt(res)])
        if tmp:
            return tmp
-        return {word}
+        return None

def known(self, words: typing.Iterable[KeyT]) -> typing.Set[str]:
"""The subset of `words` that appear in the dictionary of words
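Because `candidates()` and `correction()` can now return `None`, downstream code that consumed the result unconditionally needs a guard. A minimal migration sketch (hypothetical caller code, not from this repository):

```python
import typing

from spellchecker import SpellChecker

spell = SpellChecker()

def safe_corrections(words: typing.Iterable[str]) -> typing.Dict[str, str]:
    """Map each word to its best correction, keeping the original when no candidate exists."""
    fixed = {}
    for word in words:
        correction = spell.correction(word)
        fixed[word] = correction if correction is not None else word  # guard for the new None return
    return fixed

print(safe_corrections(["speling", "qwrtypsdfg"]))  # {'speling': 'spelling', 'qwrtypsdfg': 'qwrtypsdfg'}
```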