Feature/candidates return none (#121)
* return None when no candidate is present; remove deprecated function; update minimum frequency in default languages to 50

* update changelog

* update tests based on new minimum frequency
barrust authored May 28, 2022
1 parent 02154d7 commit 5b9c8d5
Showing 12 changed files with 113 additions and 131 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
# pyspellchecker

+## Version 0.7.0
+* Backwards Compatibility Change:
+  * `spell.candidates` and `spell.correction` now return `None` if there are no valid corrections or candidates
+* Remove misspelled words from [issue #120](https://github.com/barrust/pyspellchecker/issues/120)
+* Update all default language dictionaries after updating the minimum frequency to 50 in `scripts/build_dictionary.py`
+
## Version 0.6.3
* Added class method to be able to get a listing of all supported languages
* Added type hinting
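A minimal sketch of the new return-value behavior described in the changelog entry above (the example words are illustrative; the gibberish token is assumed to have no dictionary entry within an edit distance of two):

```python
from spellchecker import SpellChecker

spell = SpellChecker()  # loads the bundled English dictionary by default

# A word with a close match still returns a correction
print(spell.correction("speling"))     # -> "spelling"

# As of 0.7.0, a word with no plausible candidates returns None
# instead of echoing the input word back
print(spell.candidates("qwrtypsdfg"))  # -> None (assumed: no known word within edit distance two)
print(spell.correction("qwrtypsdfg"))  # -> None
```

Previously `candidates` fell back to `{word}` and `correction` to the word itself, so callers could not distinguish "already correct" from "no suggestion available".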
156 changes: 84 additions & 72 deletions scripts/build_dictionary.py
@@ -24,24 +24,22 @@
import string
from collections import Counter

-from nltk import data


STRING_PUNCTUATION = tuple(string.punctuation)
DIGETS = tuple(string.digits)
-MINIMUM_FREQUENCY = 15
+MINIMUM_FREQUENCY = 50


@contextlib.contextmanager
def load_file(filename, encoding="utf-8"):
""" Context manager to handle opening a gzip or text file correctly and
reading all the data
Args:
filename (str): The filename to open
encoding (str): The file encoding to use
Yields:
str: The string data from the file read
"""Context manager to handle opening a gzip or text file correctly and
reading all the data
Args:
filename (str): The filename to open
encoding (str): The file encoding to use
Yields:
str: The string data from the file read
"""
if filename[-3:].lower() == ".gz":
with gzip.open(filename, mode="rt", encoding=encoding) as fobj:
@@ -52,29 +50,29 @@ def load_file(filename, encoding="utf-8"):


def export_word_frequency(filepath, word_frequency):
""" Export a word frequency as a json object
"""Export a word frequency as a json object
Args:
filepath (str):
word_frequency (Counter):
Args:
filepath (str):
word_frequency (Counter):
"""
with open(filepath, 'w') as f:
with open(filepath, "w") as f:
json.dump(word_frequency, f, indent="", sort_keys=True, ensure_ascii=False)


def build_word_frequency(filepath, language, output_path):
""" Parse the passed in text file (likely from Open Subtitles) into
a word frequency list and write it out to disk
Args:
filepath (str):
language (str):
output_path (str):
Returns:
Counter: The word frequency as parsed from the file
Note:
This only removes words that are proper nouns (attempts to...) and
anything that starts or stops with something that is not in the alphabet.
"""Parse the passed in text file (likely from Open Subtitles) into
a word frequency list and write it out to disk
Args:
filepath (str):
language (str):
output_path (str):
Returns:
Counter: The word frequency as parsed from the file
Note:
This only removes words that are proper nouns (attempts to...) and
anything that starts or stops with something that is not in the alphabet.
"""
# NLTK is only needed in this portion of the project
try:
@@ -91,15 +89,19 @@ def build_word_frequency(filepath, language, output_path):
tok = WhitespaceTokenizer()

idx = 0
-    with load_file(filepath, 'utf-8') as fobj:
+    with load_file(filepath, "utf-8") as fobj:
for line in fobj:
# tokenize into parts
parts = tok.tokenize(line)

# Attempt to remove proper nouns
# Remove things that have leading or trailing non-alphabetic characters.
tagged_sent = pos_tag(parts)
-            words = [word[0].lower() for word in tagged_sent if word[0] and not word[1] == "NNP" and word[0][0].isalpha() and word[0][-1].isalpha()]
+            words = [
+                word[0].lower()
+                for word in tagged_sent
+                if word[0] and not word[1] == "NNP" and word[0][0].isalpha() and word[0][-1].isalpha()
+            ]

# print(words)
if words:
@@ -117,7 +119,7 @@ def build_word_frequency(filepath, language, output_path):


def export_misfit_words(misfit_filepath, word_freq_filepath, word_frequency):
-    with load_file(word_freq_filepath, 'utf-8') as f:
+    with load_file(word_freq_filepath, "utf-8") as f:
source_word_frequency = json.load(f)

source_words = set(source_word_frequency.keys())
@@ -126,19 +128,19 @@ def export_misfit_words(misfit_filepath, word_freq_filepath, word_frequency):
misfitted_words = source_words.difference(final_words)
misfitted_words = sorted(list(misfitted_words))

-    with open(misfit_filepath, 'w+') as file:
+    with open(misfit_filepath, "w+") as file:
for word in misfitted_words:
file.write(word)
-            file.write('\n')
+            file.write("\n")


def clean_english(word_frequency, filepath_exclude, filepath_include):
""" Clean an English word frequency list
"""Clean an English word frequency list
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
"""
letters = set("abcdefghijklmnopqrstuvwxyz'")

@@ -164,7 +166,7 @@ def clean_english(word_frequency, filepath_exclude, filepath_include):
# Remove double punctuations (a-a-a-able) or (a'whoppinganda'whumping)
double_punc = list()
for key in word_frequency:
if key.count("'") > 1 or key.count(".") > 2:
if key.count("'") > 1 or key.count("-") > 1 or key.count(".") > 2:
double_punc.append(key)
for misfit in double_punc:
word_frequency.pop(misfit)
@@ -248,12 +250,12 @@ def clean_english(word_frequency, filepath_exclude, filepath_include):


def clean_spanish(word_frequency, filepath_exclude, filepath_include):
""" Clean a Spanish word frequency list
"""Clean a Spanish word frequency list
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
"""
letters = set("abcdefghijklmnopqrstuvwxyzáéíóúüñ")

@@ -341,12 +343,12 @@ def clean_spanish(word_frequency, filepath_exclude, filepath_include):


def clean_german(word_frequency, filepath_exclude, filepath_include):
""" Clean a German word frequency list
"""Clean a German word frequency list
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
"""
letters = set("abcdefghijklmnopqrstuvwxyzäöüß")

@@ -398,12 +400,12 @@ def clean_german(word_frequency, filepath_exclude, filepath_include):


def clean_french(word_frequency, filepath_exclude, filepath_include):
""" Clean a French word frequency list
"""Clean a French word frequency list
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
"""
letters = set("abcdefghijklmnopqrstuvwxyzéàèùâêîôûëïüÿçœæ")

@@ -455,12 +457,12 @@ def clean_french(word_frequency, filepath_exclude, filepath_include):


def clean_portuguese(word_frequency, filepath_exclude, filepath_include):
""" Clean a Portuguese word frequency list
"""Clean a Portuguese word frequency list
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
"""
letters = set("abcdefghijklmnopqrstuvwxyzáâãàçéêíóôõú")

@@ -512,12 +514,12 @@ def clean_portuguese(word_frequency, filepath_exclude, filepath_include):


def clean_russian(word_frequency, filepath_exclude, filepath_include):
""" Clean an Russian word frequency list
"""Clean an Russian word frequency list
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
"""
letters = set("абвгдеёжзийклмнопрстуфхцчшщъыьэюя")

@@ -591,11 +593,21 @@ def _parse_args():
"""parse arguments for command-line usage"""
import argparse

-    parser = argparse.ArgumentParser(description='Build a new dictionary (word frequency) using the OpenSubtitles2018 project')
-    parser.add_argument("-l", "--language", required=True, help="The language being built", choices=['en', 'es', 'de', 'fr', 'pt', 'ru'])
-    parser.add_argument("-f", "--file-path", help="The path to the downloaded text file OR the saved word frequency json")
-    parser.add_argument("-p", "--parse-input", action="store_true", help="Add this if providing a text file to be parsed")
-    parser.add_argument("-m", "--misfit-file", action="store_true", help="Create file with words which were removed from dictionary")
+    parser = argparse.ArgumentParser(
+        description="Build a new dictionary (word frequency) using the OpenSubtitles2018 project"
+    )
+    parser.add_argument(
+        "-l", "--language", required=True, help="The language being built", choices=["en", "es", "de", "fr", "pt", "ru"]
+    )
+    parser.add_argument(
+        "-f", "--file-path", help="The path to the downloaded text file OR the saved word frequency json"
+    )
+    parser.add_argument(
+        "-p", "--parse-input", action="store_true", help="Add this if providing a text file to be parsed"
+    )
+    parser.add_argument(
+        "-m", "--misfit-file", action="store_true", help="Create file with words which were removed from dictionary"
+    )

args = parser.parse_args()

@@ -613,7 +625,7 @@ def _parse_args():
return args


-if __name__ == '__main__':
+if __name__ == "__main__":
args = _parse_args()

# get current path to find where the script is currently
@@ -638,12 +650,12 @@ def _parse_args():
else:
json_path = os.path.join(script_path, "data", "{}_full.json.gz".format(args.language))
print(json_path)
-        with load_file(json_path, 'utf-8') as f:
+        with load_file(json_path, "utf-8") as f:
word_frequency = json.load(f)

# create include and exclude files before cleaning
for filepath in (include_filepath, exclude_filepath):
-        with open(filepath, 'a+'):
+        with open(filepath, "a+"):
pass

# clean up the dictionary
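The raised `MINIMUM_FREQUENCY` acts as a floor on the parsed word counts; a minimal sketch of the kind of cutoff the constant drives (illustrative only, not the script's exact cleaning pipeline):

```python
from collections import Counter

MINIMUM_FREQUENCY = 50  # raised from 15 in this commit

def apply_frequency_floor(word_frequency: Counter) -> Counter:
    """Drop words seen fewer than MINIMUM_FREQUENCY times."""
    return Counter({word: count for word, count in word_frequency.items() if count >= MINIMUM_FREQUENCY})

counts = Counter({"the": 12000, "hello": 870, "helo": 49})
print(apply_frequency_floor(counts))  # Counter({'the': 12000, 'hello': 870})
```

Invocation follows the argparse block above, e.g. `python scripts/build_dictionary.py -l en -f <subtitles text file> -p` to parse a raw text file, with `-m` to also write the misfit-word file.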
3 changes: 3 additions & 0 deletions scripts/data/en_exclude.txt
@@ -152,3 +152,6 @@ suficiente
scientifiic
prophecied
lucien's
+adress
+helo
+abcs
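The three added entries are deliberate misspellings being dropped from the English dictionary (see issue #120 in the changelog). A minimal sketch of how an exclude file of this shape typically feeds the cleaning step, inferred from the `clean_english(word_frequency, filepath_exclude, filepath_include)` signature (`apply_exclude_list` is a hypothetical name, not from this repository):

```python
from collections import Counter

def apply_exclude_list(word_frequency: Counter, filepath_exclude: str) -> None:
    """Remove every word listed in the exclude file (one word per line) from the frequency map."""
    with open(filepath_exclude, encoding="utf-8") as fobj:
        for line in fobj:
            word = line.strip().lower()
            if word:
                word_frequency.pop(word, None)  # drop silently if the word is absent
```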
2 changes: 1 addition & 1 deletion spellchecker/info.py
@@ -5,7 +5,7 @@
__maintainer__ = "Tyler Barrus"
__email__ = "barrust@gmail.com"
__license__ = "MIT"
__version__ = "0.6.3"
__version__ = "0.7.0"
__credits__ = ["Peter Norvig"]
__url__ = "https://github.com/barrust/pyspellchecker"
__bugtrack_url__ = "{0}/issues".format(__url__)
Binary file modified spellchecker/resources/de.json.gz
Binary file modified spellchecker/resources/en.json.gz
Binary file modified spellchecker/resources/es.json.gz
Binary file modified spellchecker/resources/fr.json.gz
Binary file modified spellchecker/resources/pt.json.gz
Binary file modified spellchecker/resources/ru.json.gz
42 changes: 10 additions & 32 deletions spellchecker/spellchecker.py
Expand Up @@ -8,14 +8,7 @@
from collections import Counter
from collections.abc import Iterable

-from .utils import (
-    KeyT,
-    _parse_into_words,
-    deprecated,
-    ensure_unicode,
-    load_file,
-    write_file,
-)
+from .utils import KeyT, _parse_into_words, ensure_unicode, load_file, write_file


class SpellChecker(object):
@@ -155,42 +148,27 @@ def word_usage_frequency(self, word: KeyT, total_words: typing.Optional[int] = N
word = ensure_unicode(word)
return self._word_frequency.dictionary[word] / total_words

@deprecated("Deprecated as of version 0.6.5; use word_usage_frequency instead")
def word_probability(self, word: KeyT, total_words: typing.Optional[int] = None) -> float:
"""Calculate the frequency to the `word` provided as seen across the
entire dictionary; function was a misnomar and is therefore
deprecated!
Args:
word (str): The word for which the word probability is calculated
total_words (int): The total number of words to use in thecalculation; use the default for using the whole word frequency
Returns:
float: The probability that the word is the correct word
Note:
Deprecated as of version 0.6.1; use `word_usage_frequency` instead
Note:
Will be removed in version 0.6.4"""
return self.word_usage_frequency(word, total_words)

-    def correction(self, word: KeyT) -> str:
+    def correction(self, word: KeyT) -> typing.Optional[str]:
        """The most probable correct spelling for the word
        Args:
            word (str): The word to correct
        Returns:
-            str: The most likely candidate"""
+            str: The most likely candidate or None if no correction is present"""
        word = ensure_unicode(word)
-        candidates = list(self.candidates(word))
-        return max(sorted(candidates), key=self.__getitem__)
+        candidates = self.candidates(word)
+        if not candidates:
+            return None
+        return max(sorted(list(candidates)), key=self.__getitem__)

-    def candidates(self, word: KeyT) -> typing.Set[str]:
+    def candidates(self, word: KeyT) -> typing.Optional[typing.Set[str]]:
        """Generate possible spelling corrections for the provided word up to
        an edit distance of two, if and only when needed
        Args:
            word (str): The word for which to calculate candidate spellings
        Returns:
-            set: The set of words that are possible candidates"""
+            set: The set of words that are possible candidates or None if there are no candidates"""
        word = ensure_unicode(word)
        if self.known([word]):  # short-cut if word is correct already
            return {word}
@@ -208,7 +186,7 @@ def candidates(self, word: KeyT) -> typing.Set[str]:
        tmp = self.known([x for x in self.__edit_distance_alt(res)])
        if tmp:
            return tmp
-        return {word}
+        return None

def known(self, words: typing.Iterable[KeyT]) -> typing.Set[str]:
"""The subset of `words` that appear in the dictionary of words
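Because `candidates()` and `correction()` can now return `None`, downstream code that consumed the result unconditionally needs a guard. A minimal migration sketch (hypothetical caller code, not from this repository):

```python
import typing

from spellchecker import SpellChecker

spell = SpellChecker()

def safe_corrections(words: typing.Iterable[str]) -> typing.Dict[str, str]:
    """Map each word to its best correction, keeping the original when no candidate exists."""
    fixed = {}
    for word in words:
        correction = spell.correction(word)
        fixed[word] = correction if correction is not None else word  # guard for the new None return
    return fixed

print(safe_corrections(["speling", "qwrtypsdfg"]))  # {'speling': 'spelling', 'qwrtypsdfg': 'qwrtypsdfg'}
```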