Feature/candidates return none #121

Merged 3 commits on May 28, 2022
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
# pyspellchecker

## Version 0.7.0
* Backwards Compatibility Change:
* `spell.candidates` and `spell.correction` now return `None` if there are no valid corrections or candidates
* Remove misspelled words from [issue #120](https://github.com/barrust/pyspellchecker/issues/120)
* Update all default language dictionaries after updating the minimum frequency to 50 in `scripts/build_dictionary.py`

## Version 0.6.3
* Added class method to be able to get a listing of all supported languages
* Added type hinting
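For illustration, a minimal sketch (not part of this diff) of the 0.7.0 behavior change described in the changelog above; the example words are hypothetical and actual suggestions depend on the loaded dictionary:

```python
from spellchecker import SpellChecker

spell = SpellChecker()  # defaults to the bundled English dictionary

# A near-miss still resolves to a suggestion.
print(spell.correction("happpy"))   # e.g. "happy"

# For gibberish with no known word within an edit distance of two,
# 0.7.0 returns None; 0.6.x returned the input word itself.
print(spell.correction("qzxwvj"))   # None
print(spell.candidates("qzxwvj"))   # None (previously {"qzxwvj"})
```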
156 changes: 84 additions & 72 deletions scripts/build_dictionary.py
@@ -24,24 +24,22 @@
import string
from collections import Counter

from nltk import data


STRING_PUNCTUATION = tuple(string.punctuation)
DIGETS = tuple(string.digits)
MINIMUM_FREQUENCY = 15
MINIMUM_FREQUENCY = 50


@contextlib.contextmanager
def load_file(filename, encoding="utf-8"):
""" Context manager to handle opening a gzip or text file correctly and
reading all the data

Args:
filename (str): The filename to open
encoding (str): The file encoding to use
Yields:
str: The string data from the file read
"""Context manager to handle opening a gzip or text file correctly and
reading all the data

Args:
filename (str): The filename to open
encoding (str): The file encoding to use
Yields:
str: The string data from the file read
"""
if filename[-3:].lower() == ".gz":
with gzip.open(filename, mode="rt", encoding=encoding) as fobj:
@@ -52,29 +50,29 @@ def load_file(filename, encoding="utf-8"):


def export_word_frequency(filepath, word_frequency):
""" Export a word frequency as a json object
"""Export a word frequency as a json object

Args:
filepath (str):
word_frequency (Counter):
Args:
filepath (str):
word_frequency (Counter):
"""
with open(filepath, 'w') as f:
with open(filepath, "w") as f:
json.dump(word_frequency, f, indent="", sort_keys=True, ensure_ascii=False)


def build_word_frequency(filepath, language, output_path):
""" Parse the passed in text file (likely from Open Subtitles) into
a word frequency list and write it out to disk

Args:
filepath (str):
language (str):
output_path (str):
Returns:
Counter: The word frequency as parsed from the file
Note:
This only removes words that are proper nouns (attempts to...) and
anything that starts or stops with something that is not in the alphabet.
"""Parse the passed in text file (likely from Open Subtitles) into
a word frequency list and write it out to disk

Args:
filepath (str):
language (str):
output_path (str):
Returns:
Counter: The word frequency as parsed from the file
Note:
This only removes words that are proper nouns (attempts to...) and
anything that starts or stops with something that is not in the alphabet.
"""
# NLTK is only needed in this portion of the project
try:
@@ -91,15 +89,19 @@ def build_word_frequency(filepath, language, output_path):
tok = WhitespaceTokenizer()

idx = 0
with load_file(filepath, 'utf-8') as fobj:
with load_file(filepath, "utf-8") as fobj:
for line in fobj:
# tokenize into parts
parts = tok.tokenize(line)

# Attempt to remove proper nouns
# Remove things that have leading or trailing non-alphabetic characters.
tagged_sent = pos_tag(parts)
words = [word[0].lower() for word in tagged_sent if word[0] and not word[1] == "NNP" and word[0][0].isalpha() and word[0][-1].isalpha()]
words = [
word[0].lower()
for word in tagged_sent
if word[0] and not word[1] == "NNP" and word[0][0].isalpha() and word[0][-1].isalpha()
]

# print(words)
if words:
@@ -117,7 +119,7 @@


def export_misfit_words(misfit_filepath, word_freq_filepath, word_frequency):
with load_file(word_freq_filepath, 'utf-8') as f:
with load_file(word_freq_filepath, "utf-8") as f:
source_word_frequency = json.load(f)

source_words = set(source_word_frequency.keys())
@@ -126,19 +128,19 @@ def export_misfit_words(misfit_filepath, word_freq_filepath, word_frequency):
misfitted_words = source_words.difference(final_words)
misfitted_words = sorted(list(misfitted_words))

with open(misfit_filepath, 'w+') as file:
with open(misfit_filepath, "w+") as file:
for word in misfitted_words:
file.write(word)
file.write('\n')
file.write("\n")


def clean_english(word_frequency, filepath_exclude, filepath_include):
""" Clean an English word frequency list
"""Clean an English word frequency list

Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
"""
letters = set("abcdefghijklmnopqrstuvwxyz'")

@@ -164,7 +166,7 @@ def clean_english(word_frequency, filepath_exclude, filepath_include):
# Remove double punctuations (a-a-a-able) or (a'whoppinganda'whumping)
double_punc = list()
for key in word_frequency:
if key.count("'") > 1 or key.count(".") > 2:
if key.count("'") > 1 or key.count("-") > 1 or key.count(".") > 2:
double_punc.append(key)
for misfit in double_punc:
word_frequency.pop(misfit)
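A quick sketch (not part of the PR) of what the added `key.count("-") > 1` clause changes, using a hypothetical Counter:

```python
from collections import Counter

word_frequency = Counter({"a-a-a-able": 3, "well-known": 120, "don't": 500, "a'whoppin'": 2})
double_punc = [k for k in word_frequency if k.count("'") > 1 or k.count("-") > 1 or k.count(".") > 2]
for misfit in double_punc:
    word_frequency.pop(misfit)
# "a-a-a-able" (multiple hyphens) and "a'whoppin'" (two apostrophes) are dropped;
# single-hyphen "well-known" and single-apostrophe "don't" survive.
```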
@@ -248,12 +250,12 @@ def clean_english(word_frequency, filepath_exclude, filepath_include):


def clean_spanish(word_frequency, filepath_exclude, filepath_include):
""" Clean a Spanish word frequency list
"""Clean a Spanish word frequency list

Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
"""
letters = set("abcdefghijklmnopqrstuvwxyzáéíóúüñ")

@@ -341,12 +343,12 @@ def clean_spanish(word_frequency, filepath_exclude, filepath_include):


def clean_german(word_frequency, filepath_exclude, filepath_include):
""" Clean a German word frequency list
"""Clean a German word frequency list

Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
"""
letters = set("abcdefghijklmnopqrstuvwxyzäöüß")

@@ -398,12 +400,12 @@ def clean_german(word_frequency, filepath_exclude, filepath_include):


def clean_french(word_frequency, filepath_exclude, filepath_include):
""" Clean a French word frequency list
"""Clean a French word frequency list

Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
"""
letters = set("abcdefghijklmnopqrstuvwxyzéàèùâêîôûëïüÿçœæ")

@@ -455,12 +457,12 @@ def clean_french(word_frequency, filepath_exclude, filepath_include):


def clean_portuguese(word_frequency, filepath_exclude, filepath_include):
""" Clean a Portuguese word frequency list
"""Clean a Portuguese word frequency list

Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
"""
letters = set("abcdefghijklmnopqrstuvwxyzáâãàçéêíóôõú")

@@ -512,12 +514,12 @@ def clean_portuguese(word_frequency, filepath_exclude, filepath_include):


def clean_russian(word_frequency, filepath_exclude, filepath_include):
""" Clean an Russian word frequency list
"""Clean an Russian word frequency list

Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
"""
letters = set("абвгдеёжзийклмнопрстуфхцчшщъыьэюя")

@@ -591,11 +593,21 @@ def _parse_args():
"""parse arguments for command-line usage"""
import argparse

parser = argparse.ArgumentParser(description='Build a new dictionary (word frequency) using the OpenSubtitles2018 project')
parser.add_argument("-l", "--language", required=True, help="The language being built", choices=['en', 'es', 'de', 'fr', 'pt', 'ru'])
parser.add_argument("-f", "--file-path", help="The path to the downloaded text file OR the saved word frequency json")
parser.add_argument("-p", "--parse-input", action="store_true", help="Add this if providing a text file to be parsed")
parser.add_argument("-m", "--misfit-file", action="store_true", help="Create file with words which was removed from dictionary")
parser = argparse.ArgumentParser(
description="Build a new dictionary (word frequency) using the OpenSubtitles2018 project"
)
parser.add_argument(
"-l", "--language", required=True, help="The language being built", choices=["en", "es", "de", "fr", "pt", "ru"]
)
parser.add_argument(
"-f", "--file-path", help="The path to the downloaded text file OR the saved word frequency json"
)
parser.add_argument(
"-p", "--parse-input", action="store_true", help="Add this if providing a text file to be parsed"
)
parser.add_argument(
"-m", "--misfit-file", action="store_true", help="Create file with words which was removed from dictionary"
)

args = parser.parse_args()

@@ -613,7 +625,7 @@ def _parse_args():
return args


if __name__ == '__main__':
if __name__ == "__main__":
args = _parse_args()

# get current path to find where the script is currently
@@ -638,12 +650,12 @@ def _parse_args():
else:
json_path = os.path.join(script_path, "data", "{}_full.json.gz".format(args.language))
print(json_path)
with load_file(json_path, 'utf-8') as f:
with load_file(json_path, "utf-8") as f:
word_frequency = json.load(f)

# create include and exclude files before cleaning
for filepath in (include_filepath, exclude_filepath):
with open(filepath, 'a+'):
with open(filepath, "a+"):
pass

# clean up the dictionary
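The raised `MINIMUM_FREQUENCY` (15 to 50) is what triggered the dictionary rebuilds below. The pruning step itself is collapsed out of this diff, so the following is an assumed sketch of how such a cutoff is typically applied to a word-frequency Counter:

```python
from collections import Counter

MINIMUM_FREQUENCY = 50  # raised from 15 in this PR

word_frequency = Counter({"the": 12000, "serendipity": 51, "tehh": 49})
word_frequency = Counter(
    {word: count for word, count in word_frequency.items() if count >= MINIMUM_FREQUENCY}
)
# "tehh" (49 occurrences) falls below the cutoff and is dropped.
```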
3 changes: 3 additions & 0 deletions scripts/data/en_exclude.txt
@@ -152,3 +152,6 @@ suficiente
scientifiic
prophecied
lucien's
adress
helo
abcs
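How the exclude file is consumed by the clean-up functions is collapsed out of this diff; a plausible sketch, assuming one word per line (the function name is hypothetical):

```python
from collections import Counter

def apply_exclude_file(word_frequency: Counter, filepath_exclude: str) -> None:
    """Drop every word listed in the exclude file from the frequency Counter."""
    with open(filepath_exclude, encoding="utf-8") as fobj:
        for line in fobj:
            word_frequency.pop(line.strip(), None)  # no-op for words not present
```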
2 changes: 1 addition & 1 deletion spellchecker/info.py
@@ -5,7 +5,7 @@
__maintainer__ = "Tyler Barrus"
__email__ = "barrust@gmail.com"
__license__ = "MIT"
__version__ = "0.6.3"
__version__ = "0.7.0"
__credits__ = ["Peter Norvig"]
__url__ = "https://github.com/barrust/pyspellchecker"
__bugtrack_url__ = "{0}/issues".format(__url__)
Binary file modified spellchecker/resources/de.json.gz
Binary file modified spellchecker/resources/en.json.gz
Binary file modified spellchecker/resources/es.json.gz
Binary file modified spellchecker/resources/fr.json.gz
Binary file modified spellchecker/resources/pt.json.gz
Binary file modified spellchecker/resources/ru.json.gz
42 changes: 10 additions & 32 deletions spellchecker/spellchecker.py
Original file line number Diff line number Diff line change
@@ -8,14 +8,7 @@
from collections import Counter
from collections.abc import Iterable

from .utils import (
KeyT,
_parse_into_words,
deprecated,
ensure_unicode,
load_file,
write_file,
)
from .utils import KeyT, _parse_into_words, ensure_unicode, load_file, write_file


class SpellChecker(object):
@@ -155,42 +148,27 @@ def word_usage_frequency(self, word: KeyT, total_words: typing.Optional[int] = N
word = ensure_unicode(word)
return self._word_frequency.dictionary[word] / total_words

@deprecated("Deprecated as of version 0.6.5; use word_usage_frequency instead")
def word_probability(self, word: KeyT, total_words: typing.Optional[int] = None) -> float:
"""Calculate the frequency to the `word` provided as seen across the
entire dictionary; function was a misnomer and is therefore
deprecated!

Args:
word (str): The word for which the word probability is calculated
total_words (int): The total number of words to use in the calculation; use the default for using the whole word frequency
Returns:
float: The probability that the word is the correct word
Note:
Deprecated as of version 0.6.1; use `word_usage_frequency` instead
Note:
Will be removed in version 0.6.4"""
return self.word_usage_frequency(word, total_words)

def correction(self, word: KeyT) -> str:
def correction(self, word: KeyT) -> typing.Optional[str]:
"""The most probable correct spelling for the word

Args:
word (str): The word to correct
Returns:
str: The most likely candidate"""
str: The most likely candidate or None if no correction is present"""
word = ensure_unicode(word)
candidates = list(self.candidates(word))
return max(sorted(candidates), key=self.__getitem__)
candidates = self.candidates(word)
if not candidates:
return None
return max(sorted(list(candidates)), key=self.__getitem__)

def candidates(self, word: KeyT) -> typing.Set[str]:
def candidates(self, word: KeyT) -> typing.Optional[typing.Set[str]]:
"""Generate possible spelling corrections for the provided word up to
an edit distance of two, if and only when needed

Args:
word (str): The word for which to calculate candidate spellings
Returns:
set: The set of words that are possible candidates"""
set: The set of words that are possible candidates or None if there are no candidates"""
word = ensure_unicode(word)
if self.known([word]): # short-cut if word is correct already
return {word}
@@ -208,7 +186,7 @@ def candidates(self, word: KeyT) -> typing.Set[str]:
tmp = self.known([x for x in self.__edit_distance_alt(res)])
if tmp:
return tmp
return {word}
return None

def known(self, words: typing.Iterable[KeyT]) -> typing.Set[str]:
"""The subset of `words` that appear in the dictionary of words
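Since `correction` and `candidates` are now typed as Optional, callers should branch on `None`. A minimal calling-pattern sketch (not from this diff):

```python
import typing

from spellchecker import SpellChecker

def suggest(word: str) -> str:
    spell = SpellChecker()
    correction: typing.Optional[str] = spell.correction(word)
    if correction is None:
        # 0.7.0 behavior: nothing known within an edit distance of two
        return f"no suggestion for {word!r}"
    return f"did you mean {correction!r}?"

print(suggest("happpy"))   # e.g. did you mean 'happy'?
print(suggest("qzxwvj"))   # no suggestion for 'qzxwvj'
```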