diff --git a/CHANGELOG.md b/CHANGELOG.md
index ec9c12d..baa7003 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # pyspellchecker

+## Version 0.7.0
+* Backwards Compatibility Change:
+  * `spell.candidates` and `spell.correction` now return `None` if there are no valid corrections or candidates
+* Remove misspelled words from [issue #120](https://github.com/barrust/pyspellchecker/issues/120)
+* Update all default language dictionaries after updating the minimum frequency to 50 in `scripts/build_dictionary.py`
+
 ## Version 0.6.3
 * Added class method to be able to get a listing of all supported languages
 * Added type hinting
diff --git a/scripts/build_dictionary.py b/scripts/build_dictionary.py
index dc6e7fa..e5e6a57 100644
--- a/scripts/build_dictionary.py
+++ b/scripts/build_dictionary.py
@@ -24,24 +24,22 @@
 import string
 from collections import Counter

-from nltk import data
-
 STRING_PUNCTUATION = tuple(string.punctuation)
 DIGETS = tuple(string.digits)

-MINIMUM_FREQUENCY = 15
+MINIMUM_FREQUENCY = 50


 @contextlib.contextmanager
 def load_file(filename, encoding="utf-8"):
-    """ Context manager to handle opening a gzip or text file correctly and
-        reading all the data
-
-        Args:
-            filename (str): The filename to open
-            encoding (str): The file encoding to use
-        Yields:
-            str: The string data from the file read
+    """Context manager to handle opening a gzip or text file correctly and
+    reading all the data
+
+    Args:
+        filename (str): The filename to open
+        encoding (str): The file encoding to use
+    Yields:
+        str: The string data from the file read
     """
     if filename[-3:].lower() == ".gz":
         with gzip.open(filename, mode="rt", encoding=encoding) as fobj:
@@ -52,29 +50,29 @@


 def export_word_frequency(filepath, word_frequency):
-    """ Export a word frequency as a json object
+    """Export a word frequency as a json object

-        Args:
-            filepath (str):
-            word_frequency (Counter):
+    Args:
+        filepath (str):
+        word_frequency (Counter):
     """
-    with open(filepath, 'w') as f:
+    with open(filepath, "w") as f:
         json.dump(word_frequency, f, indent="", sort_keys=True, ensure_ascii=False)


 def build_word_frequency(filepath, language, output_path):
-    """ Parse the passed in text file (likely from Open Subtitles) into
-        a word frequency list and write it out to disk
-
-        Args:
-            filepath (str):
-            language (str):
-            output_path (str):
-        Returns:
-            Counter: The word frequency as parsed from the file
-        Note:
-            This only removes words that are proper nouns (attempts to...) and
-            anything that starts or stops with something that is not in the alphabet.
+    """Parse the passed in text file (likely from Open Subtitles) into
+    a word frequency list and write it out to disk
+
+    Args:
+        filepath (str):
+        language (str):
+        output_path (str):
+    Returns:
+        Counter: The word frequency as parsed from the file
+    Note:
+        This only removes words that are proper nouns (attempts to...) and
+        anything that starts or stops with something that is not in the alphabet.
     """
     # NLTK is only needed in this portion of the project
     try:
@@ -91,7 +89,7 @@ def build_word_frequency(filepath, language, output_path):
     tok = WhitespaceTokenizer()

     idx = 0
-    with load_file(filepath, 'utf-8') as fobj:
+    with load_file(filepath, "utf-8") as fobj:
         for line in fobj:
             # tokenize into parts
             parts = tok.tokenize(line)
@@ -99,7 +97,11 @@
             # Attempt to remove proper nouns
             # Remove things that have leading or trailing non-alphabetic characters.
             tagged_sent = pos_tag(parts)
-            words = [word[0].lower() for word in tagged_sent if word[0] and not word[1] == "NNP" and word[0][0].isalpha() and word[0][-1].isalpha()]
+            words = [
+                word[0].lower()
+                for word in tagged_sent
+                if word[0] and not word[1] == "NNP" and word[0][0].isalpha() and word[0][-1].isalpha()
+            ]

             # print(words)
             if words:
@@ -117,7 +119,7 @@ def build_word_frequency(filepath, language, output_path):


 def export_misfit_words(misfit_filepath, word_freq_filepath, word_frequency):
-    with load_file(word_freq_filepath, 'utf-8') as f:
+    with load_file(word_freq_filepath, "utf-8") as f:
         source_word_frequency = json.load(f)

     source_words = set(source_word_frequency.keys())
@@ -126,19 +128,19 @@ def export_misfit_words(misfit_filepath, word_freq_filepath, word_frequency):
     misfitted_words = source_words.difference(final_words)
     misfitted_words = sorted(list(misfitted_words))

-    with open(misfit_filepath, 'w+') as file:
+    with open(misfit_filepath, "w+") as file:
         for word in misfitted_words:
             file.write(word)
-            file.write('\n')
+            file.write("\n")


 def clean_english(word_frequency, filepath_exclude, filepath_include):
-    """ Clean an English word frequency list
+    """Clean an English word frequency list

-        Args:
-            word_frequency (Counter):
-            filepath_exclude (str):
-            filepath_include (str):
+    Args:
+        word_frequency (Counter):
+        filepath_exclude (str):
+        filepath_include (str):
     """
     letters = set("abcdefghijklmnopqrstuvwxyz'")

@@ -164,7 +166,7 @@ def clean_english(word_frequency, filepath_exclude, filepath_include):
     # Remove double punctuations (a-a-a-able) or (a'whoppinganda'whumping)
     double_punc = list()
     for key in word_frequency:
-        if key.count("'") > 1 or key.count(".") > 2:
+        if key.count("'") > 1 or key.count("-") > 1 or key.count(".") > 2:
             double_punc.append(key)
     for misfit in double_punc:
         word_frequency.pop(misfit)
@@ -248,12 +250,12 @@


 def clean_spanish(word_frequency, filepath_exclude, filepath_include):
-    """ Clean a Spanish word frequency list
+    """Clean a Spanish word frequency list

-        Args:
-            word_frequency (Counter):
-            filepath_exclude (str):
-            filepath_include (str):
+    Args:
+        word_frequency (Counter):
+        filepath_exclude (str):
+        filepath_include (str):
     """
     letters = set("abcdefghijklmnopqrstuvwxyzáéíóúüñ")

@@ -341,12 +343,12 @@


 def clean_german(word_frequency, filepath_exclude, filepath_include):
-    """ Clean a German word frequency list
+    """Clean a German word frequency list

-        Args:
-            word_frequency (Counter):
-            filepath_exclude (str):
-            filepath_include (str):
+    Args:
+        word_frequency (Counter):
+        filepath_exclude (str):
+        filepath_include (str):
     """
     letters = set("abcdefghijklmnopqrstuvwxyzäöüß")

@@ -398,12 +400,12 @@


 def clean_french(word_frequency, filepath_exclude, filepath_include):
-    """ Clean a French word frequency list
+    """Clean a French word frequency list

-        Args:
-            word_frequency (Counter):
-            filepath_exclude (str):
-            filepath_include (str):
+    Args:
+        word_frequency (Counter):
+        filepath_exclude (str):
+        filepath_include (str):
     """
     letters = set("abcdefghijklmnopqrstuvwxyzéàèùâêîôûëïüÿçœæ")

@@ -455,12 +457,12 @@


 def clean_portuguese(word_frequency, filepath_exclude, filepath_include):
-    """ Clean a Portuguese word frequency list
+    """Clean a Portuguese word frequency list

-        Args:
-            word_frequency (Counter):
-            filepath_exclude (str):
-            filepath_include (str):
+    Args:
+        word_frequency (Counter):
+        filepath_exclude (str):
+        filepath_include (str):
     """
     letters = set("abcdefghijklmnopqrstuvwxyzáâãàçéêíóôõú")

@@ -512,12 +514,12 @@


 def clean_russian(word_frequency, filepath_exclude, filepath_include):
-    """ Clean an Russian word frequency list
+    """Clean an Russian word frequency list

-        Args:
-            word_frequency (Counter):
-            filepath_exclude (str):
-            filepath_include (str):
+    Args:
+        word_frequency (Counter):
+        filepath_exclude (str):
+        filepath_include (str):
     """
     letters = set("абвгдеёжзийклмнопрстуфхцчшщъыьэюя")

@@ -591,11 +593,21 @@
 def _parse_args():
     """parse arguments for command-line usage"""
     import argparse

-    parser = argparse.ArgumentParser(description='Build a new dictionary (word frequency) using the OpenSubtitles2018 project')
-    parser.add_argument("-l", "--language", required=True, help="The language being built", choices=['en', 'es', 'de', 'fr', 'pt', 'ru'])
-    parser.add_argument("-f", "--file-path", help="The path to the downloaded text file OR the saved word frequency json")
-    parser.add_argument("-p", "--parse-input", action="store_true", help="Add this if providing a text file to be parsed")
-    parser.add_argument("-m", "--misfit-file", action="store_true", help="Create file with words which was removed from dictionary")
+    parser = argparse.ArgumentParser(
+        description="Build a new dictionary (word frequency) using the OpenSubtitles2018 project"
+    )
+    parser.add_argument(
+        "-l", "--language", required=True, help="The language being built", choices=["en", "es", "de", "fr", "pt", "ru"]
+    )
+    parser.add_argument(
+        "-f", "--file-path", help="The path to the downloaded text file OR the saved word frequency json"
+    )
+    parser.add_argument(
+        "-p", "--parse-input", action="store_true", help="Add this if providing a text file to be parsed"
+    )
+    parser.add_argument(
+        "-m", "--misfit-file", action="store_true", help="Create file with words which was removed from dictionary"
+    )

     args = parser.parse_args()
@@ -613,7 +625,7 @@ def _parse_args():
     return args


-if __name__ == '__main__':
+if __name__ == "__main__":
     args = _parse_args()

     # get current path to find where the script is currently
@@ -638,12 +650,12 @@ def _parse_args():
     else:
         json_path = os.path.join(script_path, "data", "{}_full.json.gz".format(args.language))
         print(json_path)
-        with load_file(json_path, 'utf-8') as f:
+        with load_file(json_path, "utf-8") as f:
             word_frequency = json.load(f)

     # create include and exclude files before cleaning
     for filepath in (include_filepath, exclude_filepath):
-        with open(filepath, 'a+'):
+        with open(filepath, "a+"):
             pass

     # clean up the dictionary
diff --git a/scripts/data/en_exclude.txt b/scripts/data/en_exclude.txt
index f6cf56f..d9fe68a 100644
--- a/scripts/data/en_exclude.txt
+++ b/scripts/data/en_exclude.txt
@@ -152,3 +152,6 @@ suficiente
 scientifiic
 prophecied
 lucien's
+adress
+helo
+abcs
\ No newline at end of file
diff --git a/spellchecker/info.py b/spellchecker/info.py
index 3a93f93..9d25beb 100644
--- a/spellchecker/info.py
+++ b/spellchecker/info.py
@@ -5,7 +5,7 @@
 __maintainer__ = "Tyler Barrus"
 __email__ = "barrust@gmail.com"
 __license__ = "MIT"
-__version__ = "0.6.3"
+__version__ = "0.7.0"
 __credits__ = ["Peter Norvig"]
 __url__ = "https://github.com/barrust/pyspellchecker"
 __bugtrack_url__ = "{0}/issues".format(__url__)
diff --git a/spellchecker/resources/de.json.gz b/spellchecker/resources/de.json.gz
index cbea828..e98c4bc 100644
Binary files a/spellchecker/resources/de.json.gz and b/spellchecker/resources/de.json.gz differ
diff --git a/spellchecker/resources/en.json.gz b/spellchecker/resources/en.json.gz
index 2d8dcfc..5214a95 100644
Binary files a/spellchecker/resources/en.json.gz and b/spellchecker/resources/en.json.gz differ
diff --git a/spellchecker/resources/es.json.gz b/spellchecker/resources/es.json.gz
index 6adac25..7e8dc59 100644
Binary files a/spellchecker/resources/es.json.gz and b/spellchecker/resources/es.json.gz differ
diff --git a/spellchecker/resources/fr.json.gz b/spellchecker/resources/fr.json.gz
index 099ab61..61b6da4 100644
Binary files a/spellchecker/resources/fr.json.gz and b/spellchecker/resources/fr.json.gz differ
diff --git a/spellchecker/resources/pt.json.gz b/spellchecker/resources/pt.json.gz
index 3e87c11..a3b1b19 100644
Binary files a/spellchecker/resources/pt.json.gz and b/spellchecker/resources/pt.json.gz differ
diff --git a/spellchecker/resources/ru.json.gz b/spellchecker/resources/ru.json.gz
index 0335ed5..da5afd8 100644
Binary files a/spellchecker/resources/ru.json.gz and b/spellchecker/resources/ru.json.gz differ
diff --git a/spellchecker/spellchecker.py b/spellchecker/spellchecker.py
index 4215327..37beb2e 100644
--- a/spellchecker/spellchecker.py
+++ b/spellchecker/spellchecker.py
@@ -8,14 +8,7 @@
 from collections import Counter
 from collections.abc import Iterable

-from .utils import (
-    KeyT,
-    _parse_into_words,
-    deprecated,
-    ensure_unicode,
-    load_file,
-    write_file,
-)
+from .utils import KeyT, _parse_into_words, ensure_unicode, load_file, write_file


 class SpellChecker(object):
@@ -155,42 +148,27 @@ def word_usage_frequency(self, word: KeyT, total_words: typing.Optional[int] = N
         word = ensure_unicode(word)
         return self._word_frequency.dictionary[word] / total_words

-    @deprecated("Deprecated as of version 0.6.5; use word_usage_frequency instead")
-    def word_probability(self, word: KeyT, total_words: typing.Optional[int] = None) -> float:
-        """Calculate the frequency to the `word` provided as seen across the
-        entire dictionary; function was a misnomar and is therefore deprecated!
-
-        Args:
-            word (str): The word for which the word probability is calculated
-            total_words (int): The total number of words to use in thecalculation; use the default for using the whole word frequency
-        Returns:
-            float: The probability that the word is the correct word
-        Note:
-            Deprecated as of version 0.6.1; use `word_usage_frequency` instead
-        Note:
-            Will be removed in version 0.6.4"""
-        return self.word_usage_frequency(word, total_words)
-
-    def correction(self, word: KeyT) -> str:
+    def correction(self, word: KeyT) -> typing.Optional[str]:
         """The most probable correct spelling for the word

         Args:
             word (str): The word to correct
         Returns:
-            str: The most likely candidate"""
+            str: The most likely candidate or None if no correction is present"""
         word = ensure_unicode(word)
-        candidates = list(self.candidates(word))
-        return max(sorted(candidates), key=self.__getitem__)
+        candidates = self.candidates(word)
+        if not candidates:
+            return None
+        return max(sorted(list(candidates)), key=self.__getitem__)

-    def candidates(self, word: KeyT) -> typing.Set[str]:
+    def candidates(self, word: KeyT) -> typing.Optional[typing.Set[str]]:
         """Generate possible spelling corrections for the provided word
         up to an edit distance of two, if and only when needed

         Args:
             word (str): The word for which to calculate candidate spellings
         Returns:
-            set: The set of words that are possible candidates"""
+            set: The set of words that are possible candidates or None if there are no candidates"""
         word = ensure_unicode(word)
         if self.known([word]):  # short-cut if word is correct already
             return {word}
@@ -208,7 +186,7 @@ def candidates(self, word: KeyT) -> typing.Set[str]:
             tmp = self.known([x for x in self.__edit_distance_alt(res)])
             if tmp:
                 return tmp
-        return {word}
+        return None

     def known(self, words: typing.Iterable[KeyT]) -> typing.Set[str]:
         """The subset of `words` that appear in the dictionary of words
diff --git a/tests/spellchecker_test.py b/tests/spellchecker_test.py
index 3930bfb..4fc37df 100644
--- a/tests/spellchecker_test.py
+++ b/tests/spellchecker_test.py
@@ -5,7 +5,6 @@
 import os

 from spellchecker import SpellChecker
-from spellchecker.utils import fail_after


 class TestSpellChecker(unittest.TestCase):
@@ -29,30 +28,24 @@ def test_candidates(self):
             "tes",
             "thas",
             "tis",
-            "thse",
             "thes",
             "thus",
-            "ohs",
             "thu",
             "thy",
             "thi",
             "tas",
             "tus",
             "thos",
-            "ahs",
             "tho",
             "tha",
-            "thsi",
-            "tos",
             "the",
             "this",
-            "iths",
         }
         self.assertEqual(spell.candidates("ths"), cands)
         self.assertEqual(spell.candidates("the"), {"the"})
         self.assertEqual(spell.candidates("-"), {"-"})
-        # something that cannot exist... should return just the same element...
-        self.assertEqual(spell.candidates("manasaeds"), {"manasaeds"})
+        # something that cannot exist... should return None...
+        self.assertEqual(spell.candidates("manasaeds"), None)

     def test_words(self):
         """test the parsing of words"""
@@ -80,16 +73,6 @@ def test_word_usage_frequency(self):
         denom = spell.word_frequency.total_words
         self.assertEqual(spell.word_usage_frequency("the"), num / denom)

-    # deprecated!
-    @fail_after("0.6.4")
-    def test_word_probability_calc(self):
-        """test the word probability calculation"""
-        spell = SpellChecker()
-        # if the default load changes so will this...
-        num = spell.word_frequency["the"]
-        denom = spell.word_frequency.total_words
-        self.assertEqual(spell.word_probability("the"), num / denom)
-
     def test_word_known(self):
         """test if the word is a `known` word or not"""
         spell = SpellChecker()
@@ -254,13 +237,13 @@ def test_remove_by_threshold(self):
         spell = SpellChecker()
         cnt = 0
         for key in spell.word_frequency.keys():
-            if spell.word_frequency[key] < 30:
+            if spell.word_frequency[key] < 300:
                 cnt += 1
         self.assertGreater(cnt, 0)

-        spell.word_frequency.remove_by_threshold(30)
+        spell.word_frequency.remove_by_threshold(300)
         cnt = 0
         for key in spell.word_frequency.words():  # synonym for keys
-            if spell.word_frequency[key] < 30:
+            if spell.word_frequency[key] < 300:
                 cnt += 1
         self.assertEqual(cnt, 0)

@@ -269,13 +252,13 @@ def test_remove_by_threshold_using_items(self):
         spell = SpellChecker()
         cnt = 0
         for _, val in spell.word_frequency.items():
-            if val < 30:
+            if val < 300:
                 cnt += 1
         self.assertGreater(cnt, 0)

-        spell.word_frequency.remove_by_threshold(30)
+        spell.word_frequency.remove_by_threshold(300)
         cnt = 0
         for _, val in spell.word_frequency.items():  # synonym for keys
-            if val < 30:
+            if val < 300:
                 cnt += 1
         self.assertEqual(cnt, 0)

@@ -363,7 +346,7 @@ def test_large_words(self):
         self.assertEqual(spell.correction("bobb"), "bob")
         self.assertEqual(spell.correction("bobby"), "bob")
         self.assertEqual(spell.word_frequency.longest_word_length, 3)
-        self.assertEqual(spell.correction("bobbys"), "bobbys")
+        self.assertIsNone(spell.correction("bobbys"))

     def test_extremely_large_words(self):
         """test when a word is just extreamly large"""