Skip to content

Commit

Permalink
add parameter checks for distance; add ability to export (#20)
Browse files Browse the repository at this point in the history
* add parameter checks for distance; 
* add ability to export
* prepare for release
* add distance property to documentation
* more quickstart documentation
  • Loading branch information
barrust authored Oct 6, 2018
1 parent 31147fe commit db3e2e3
Show file tree
Hide file tree
Showing 6 changed files with 124 additions and 10 deletions.
8 changes: 6 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
# pyspellchecker

## Version 0.1.5 (unreleased)
## Version 0.1.5
* Remove words based on threshold
* Add ability to iterate over words (keys) in the dictionary
* Add setting to reduce the edit distance check
[see PR #17](https://github.com/barrust/pyspellchecker/pull/17) Thanks [@mrjamesriley](https://github.com/mrjamesriley)
[see PR #17](https://github.com/barrust/pyspellchecker/pull/17) Thanks [@mrjamesriley](https://github.com/mrjamesriley)
* Added Export functionality:
* json
* gzip
* Updated logic for loading dictionaries to be either language or local_dictionary

## Version 0.1.4
* Ability to easily remove words
Expand Down
18 changes: 16 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ German, and French. Dictionaries were generated using the `WordFrequency project
``pyspellchecker`` supports **Python 3**. It may work for Python 2.7 but it is not
guaranteed (especially for Non-English dictionaries)!

``pyspellchecker`` allows for the setting of the Levenshtein Distance to check.
For longer words, it is highly recommended to use a distance of 1 and not the
default 2. See the quickstart to find how one can change the distance parameter.


Installation
-------------------------------------------------------------------------------
Expand Down Expand Up @@ -86,10 +90,20 @@ text to generate a more appropriate list for your use case.
spell.known(['microsoft', 'google']) # will return both now!
If the words that you wish to check are long, it is recommended to reduce the
`distance` to 1. This can be accomplished either when initializing the spell
check class or after the fact.

.. code:: python
from spellchecker import SpellChecker
spell = SpellChecker(distance=1) # set at initialization
# do some work on longer words
spell.distance = 2 # set the distance parameter back to the default
More work in storing and loading word frequency lists is planned; stay
tuned.
Additional Methods
Expand Down
12 changes: 12 additions & 0 deletions docs/source/quickstart.rst
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,18 @@ Once a word is identified as misspelled, you can find the likeliest replacement:
spell.correction(word) # 'happening'
.. code:: python
from spellchecker import SpellChecker
spell = SpellChecker(distance=1) # set the Levenshtein Distance parameter
# do additional work
# now for shorter words, we can revert to Levenshtein Distance of 2!
spell.distance = 2
Or if the word identified as the likeliest is not correct, a list of candidates
can also be pulled:

Expand Down
2 changes: 1 addition & 1 deletion spellchecker/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
__maintainer__ = 'Tyler Barrus'
__email__ = 'barrust@gmail.com'
__license__ = 'MIT'
__version__ = '0.1.4'
__version__ = '0.1.5'
__credits__ = ['Peter Norvig']
__url__ = 'https://github.com/barrust/pyspellchecker'
__bugtrack_url__ = '{0}/issues'.format(__url__)
46 changes: 41 additions & 5 deletions spellchecker/spellchecker.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,16 @@ class SpellChecker(object):
for no dictionary. Supported languages are `en`, `es`, `de`, and \
`fr`. Defaults to `en`
local_dictionary (str): The path to a locally stored word \
frequency dictionary
frequency dictionary; if provided, no language will be loaded
distance (int): The edit distance to use. Defaults to 2'''


def __init__(self, language='en', local_dictionary=None, distance=2):
self._distance = distance
self._distance = None
self.distance = distance # use the setter value check
self._word_frequency = WordFrequency()
if local_dictionary:
self._word_frequency.load_dictionary(local_dictionary)
if language:
elif language:
filename = '{}.json.gz'.format(language)
here = os.path.dirname(__file__)
full_filename = os.path.join(here, 'resources', filename)
Expand All @@ -55,6 +55,27 @@ def word_frequency(self):
Not settable '''
return self._word_frequency

@property
def distance(self):
    ''' int: The maximum edit distance to calculate

        Note:
            Valid values are 1 or 2; if an invalid value is passed,
            defaults to 2 '''
    return self._distance

@distance.setter
def distance(self, val):
    ''' Set the maximum edit distance, falling back to 2 on bad input

        Args:
            val: The requested distance; anything `int()` can convert \
            is accepted (floats are truncated, numeric strings parsed)

        Note:
            The stored value is always the int 1 or 2; values that \
            cannot be converted or that fall outside that range are \
            replaced by the default of 2 '''
    try:
        # keep the converted value so that the stored distance is a real
        # int rather than whatever int-like object the caller passed in
        candidate = int(val)
    except (ValueError, TypeError, OverflowError):
        # OverflowError covers int(float('inf'))
        candidate = 2
    if candidate not in (1, 2):
        candidate = 2
    self._distance = candidate

@staticmethod
def words(text):
''' Split text into individual `words` using a simple whitespace regex
Expand All @@ -65,6 +86,20 @@ def words(text):
list(str): A listing of all words in the provided text '''
return _words(text)

def export(self, filepath, gzipped=True):
    ''' Write the word frequency dictionary to disk as JSON so that it
        can be loaded again later

        Args:
            filepath (str): Where the exported dictionary is written
            gzipped (bool): Compress the output with gzip when truthy, \
            otherwise write plain text '''
    # serialize first so nothing is created on disk if encoding fails
    payload = json.dumps(self.word_frequency.dictionary, sort_keys=True)
    opener = gzip.open if gzipped else open
    mode = 'wt' if gzipped else 'w'
    with opener(filepath, mode) as handle:
        handle.write(payload)

def word_probability(self, word, total_words=None):
''' Calculate the probability of the `word` being the desired, correct
word
Expand Down Expand Up @@ -100,7 +135,8 @@ def candidates(self, word):
set: The set of words that are possible candidates '''

return (self.known([word]) or self.known(self.edit_distance_1(word)) or
(self._distance == 2 and self.known(self.edit_distance_2(word))) or {word})
(self._distance == 2 and
self.known(self.edit_distance_2(word))) or {word})

def known(self, words):
''' The subset of `words` that appear in the dictionary of words
Expand Down
48 changes: 48 additions & 0 deletions tests/spellchecker_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,22 @@ def test_edit_distance_one(self):
spell = SpellChecker(language=None, local_dictionary=filepath, distance=1)
self.assertEqual(spell.candidates('hike'), {'bike'})

def test_edit_distance_one_property(self):
    ''' the distance given at construction is readable through the
        property and can be reassigned afterwards '''
    checker = SpellChecker(distance=1)
    self.assertEqual(checker.distance, 1)

    # reassign through the property setter
    checker.distance = 2
    self.assertEqual(checker.distance, 2)

def test_edit_distance_invalud(self):
    ''' invalid distance values must fall back to the default of 2 '''
    # an invalid value at construction time
    checker = SpellChecker(distance=None)
    self.assertEqual(checker.distance, 2)

    # a valid value is stored as given
    checker.distance = 1
    self.assertEqual(checker.distance, 1)

    # an invalid value resets to the default, not to the previous value
    checker.distance = 'string'
    self.assertEqual(checker.distance, 2)

def test_edit_distance_two(self):
''' test a case where edit distance must be two '''
here = os.path.dirname(__file__)
Expand Down Expand Up @@ -187,3 +203,35 @@ def test_unique_words(self):
''' test the unique word count '''
spell = SpellChecker()
self.assertEqual(spell.word_frequency.unique_words, len(list(spell.word_frequency.keys())))

def test_import_export_json(self):
    ''' test the export functionality as json: export a modified
        dictionary uncompressed, reload it, and check its contents '''
    here = os.path.dirname(__file__)
    filepath = '{}/resources/small_dictionary.json'.format(here)
    new_filepath = '{}/resources/small_dictionary_new.json'.format(here)

    spell = SpellChecker(language=None, local_dictionary=filepath)
    spell.word_frequency.add('meh')
    try:
        spell.export(new_filepath, gzipped=False)

        sp = SpellChecker(language=None, local_dictionary=new_filepath)
        self.assertTrue('meh' in sp)
        self.assertFalse('bananna' in sp)
    finally:
        # always clean up so a failing assertion does not leave the
        # exported file behind in the resources directory; the file may
        # not exist if the export itself failed
        if os.path.exists(new_filepath):
            os.remove(new_filepath)

def test_import_export_gzip(self):
    ''' test the export functionality as gzip: export a modified
        dictionary compressed, reload it, and check its contents '''
    here = os.path.dirname(__file__)
    filepath = '{}/resources/small_dictionary.json'.format(here)
    new_filepath = '{}/resources/small_dictionary_new.json.gz'.format(here)

    spell = SpellChecker(language=None, local_dictionary=filepath)
    spell.word_frequency.add('meh')
    try:
        spell.export(new_filepath, gzipped=True)

        sp = SpellChecker(language=None, local_dictionary=new_filepath)
        self.assertTrue('meh' in sp)
        self.assertFalse('bananna' in sp)
    finally:
        # always clean up so a failing assertion does not leave the
        # exported file behind in the resources directory; the file may
        # not exist if the export itself failed
        if os.path.exists(new_filepath):
            os.remove(new_filepath)

0 comments on commit db3e2e3

Please sign in to comment.