Skip to content

Commit

Permalink
add Portugues; encoding; __slots__
Browse files Browse the repository at this point in the history
  • Loading branch information
barrust committed Nov 6, 2018
1 parent 01a5970 commit 1603837
Show file tree
Hide file tree
Showing 7 changed files with 32 additions and 19 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ install:
- pip install -r requirements/requirements-dev.txt

script:
- coverage run --source=spellchecker setup.py test
- coverage run --source=spellchecker setup.py test

# commands to run after the tests successfully complete
after_success:
Expand Down
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# pyspellchecker

## Version 0.2.0
* Changed `words` function to `split_words` to differentiate it from the `word_frequency.words` function
* Added ***Portuguese*** dictionary: `pt`
* Add encoding argument to `gzip.open` and `open` dictionary loading and exporting
* Use of `__slots__` for class objects

## Version 0.1.5
* Remove words based on threshold
* Add ability to iterate over words (keys) in the dictionary
Expand Down
3 changes: 2 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ list. Those words that are found more often in the frequency list are
**more likely** the correct results.

``pyspellchecker`` supports multiple languages including English, Spanish,
German, and French. Dictionaries were generated using the `WordFrequency project <https://github.com/hermitdave/FrequencyWords>`__ on GitHub.
German, French, and Portuguese. Dictionaries were generated using
the `WordFrequency project <https://github.com/hermitdave/FrequencyWords>`__ on GitHub.

``pyspellchecker`` supports **Python 3**. It may work for Python 2.7 but it is not
guaranteed (especially for Non-English dictionaries)!
Expand Down
2 changes: 1 addition & 1 deletion spellchecker/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
__maintainer__ = 'Tyler Barrus'
__email__ = 'barrust@gmail.com'
__license__ = 'MIT'
__version__ = '0.1.5'
__version__ = '0.2.0'
__credits__ = ['Peter Norvig']
__url__ = 'https://github.com/barrust/pyspellchecker'
__bugtrack_url__ = '{0}/issues'.format(__url__)
Binary file added spellchecker/resources/pt.json.gz
Binary file not shown.
36 changes: 21 additions & 15 deletions spellchecker/spellchecker.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,13 @@ class SpellChecker(object):
Args:
language (str): The language of the dictionary to load or None \
for no dictionary. Supported languages are `en`, `es`, `de`, and \
`fr`. Defaults to `en`
for no dictionary. Supported languages are `en`, `es`, `de`, `fr` \
and `pt`. Defaults to `en`
local_dictionary (str): The path to a locally stored word \
frequency dictionary; if provided, no language will be loaded
distance (int): The edit distance to use. Defaults to 2'''
distance (int): The edit distance to use. Defaults to 2 '''

__slots__ = ['_distance', '_word_frequency']

def __init__(self, language='en', local_dictionary=None, distance=2):
self._distance = None
Expand Down Expand Up @@ -77,7 +79,7 @@ def distance(self, val):
self._distance = tmp

@staticmethod
def words(text):
def split_words(text):
''' Split text into individual `words` using a simple whitespace regex
Args:
Expand All @@ -86,18 +88,19 @@ def words(text):
list(str): A listing of all words in the provided text '''
return _words(text)

def export(self, filepath, gzipped=True):
def export(self, filepath, encoding='utf-8', gzipped=True):
''' Export the word frequency list for import in the future
Args:
filepath (str): The filepath to the exported dictionary
encoding (str): The encoding of the resulting output
gzipped (bool): Whether to gzip the dictionary or not '''
data = json.dumps(self.word_frequency.dictionary, sort_keys=True)
if gzipped:
with gzip.open(filepath, 'wt') as fobj:
with gzip.open(filepath, 'wt', encoding=encoding) as fobj:
fobj.write(data)
else:
with open(filepath, 'w') as fobj:
with open(filepath, 'w', encoding=encoding) as fobj:
fobj.write(data)

def word_probability(self, word, total_words=None):
Expand Down Expand Up @@ -210,6 +213,7 @@ class WordFrequency(object):
''' Store the `dictionary` as a word frequency list while allowing for
different methods to load the data and update over time '''

__slots__ = ['_dictionary', '_total_words', '_unique_words', '_letters']
def __init__(self):
self._dictionary = Counter()
self._total_words = 0
Expand Down Expand Up @@ -278,27 +282,29 @@ def words(self):
for word in self._dictionary.keys():
yield word

def load_dictionary(self, filename):
def load_dictionary(self, filename, encoding='utf-8'):
''' Load in a pre-built word frequency list
Args:
filename (str): The filepath to the json (optionally gzipped) \
file to be loaded '''
file to be loaded
encoding (str): The encoding of the dictionary '''
try:
with gzip.open(filename, 'rt') as fobj:
with gzip.open(filename, mode='rt', encoding=encoding) as fobj:
data = fobj.read().lower()
except OSError:
with open(filename, 'r') as fobj:
with open(filename, mode='r', encoding=encoding) as fobj:
data = fobj.read().lower()
self._dictionary.update(json.loads(data, encoding='utf8'))
self._dictionary.update(json.loads(data, encoding=encoding))
self._update_dictionary()

def load_text_file(self, filename):
def load_text_file(self, filename, encoding='utf-8'):
''' Load in a text file from which to generate a word frequency list
Args:
filename (str): The filepath to the text file to be loaded '''
with open(filename, 'r') as fobj:
filename (str): The filepath to the text file to be loaded
encoding (str): The encoding of the text file '''
with open(filename, 'r', encoding=encoding) as fobj:
self.load_text(fobj.read())

def load_text(self, text):
Expand Down
2 changes: 1 addition & 1 deletion tests/spellchecker_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def test_words(self):
''' test the parsing of words '''
spell = SpellChecker()
res = ['this', 'is', 'a', 'test', 'of', 'this']
self.assertEqual(spell.words('This is a test of this'), res)
self.assertEqual(spell.split_words('This is a test of this'), res)

def test_word_frequency(self):
''' test word frequency '''
Expand Down

0 comments on commit 1603837

Please sign in to comment.