diff --git a/.travis.yml b/.travis.yml index b669a7e..66b8838 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,7 @@ install: - pip install -r requirements/requirements-dev.txt script: - - coverage run --source=spellchecker setup.py test + - coverage run --source=spellchecker setup.py test # commands to run after the tests successfully complete after_success: diff --git a/CHANGELOG.md b/CHANGELOG.md index 6bd9803..6c94a66 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # pyspellchecker +## Version 0.2.0 +* Changed `words` function to `split_words` to differentiate it from the `word_frequency.words` function +* Added ***Portuguese*** dictionary: `pt` +* Add encoding argument to `gzip.open` and `open` for dictionary loading and exporting +* Use of `__slots__` for class objects + ## Version 0.1.5 * Remove words based on threshold * Add ability to iterate over words (keys) in the dictionary diff --git a/README.rst b/README.rst index 35ec032..1484ac9 100644 --- a/README.rst +++ b/README.rst @@ -21,7 +21,8 @@ list. Those words that are found more often in the frequency list are **more likely** the correct results. ``pyspellchecker`` supports multiple languages including English, Spanish, -German, and French. Dictionaries were generated using the `WordFrequency project `__ on GitHub. +German, French, and Portuguese. Dictionaries were generated using +the `WordFrequency project `__ on GitHub. ``pyspellchecker`` supports **Python 3**. If may work for Python 2.7 but it is not guaranteed (especially for Non-English dictionaries)! 
diff --git a/spellchecker/info.py b/spellchecker/info.py index 31d0109..875a0bf 100644 --- a/spellchecker/info.py +++ b/spellchecker/info.py @@ -5,7 +5,7 @@ __maintainer__ = 'Tyler Barrus' __email__ = 'barrust@gmail.com' __license__ = 'MIT' -__version__ = '0.1.5' +__version__ = '0.2.0' __credits__ = ['Peter Norvig'] __url__ = 'https://github.com/barrust/pyspellchecker' __bugtrack_url__ = '{0}/issues'.format(__url__) diff --git a/spellchecker/resources/pt.json.gz b/spellchecker/resources/pt.json.gz new file mode 100644 index 0000000..03fff2b Binary files /dev/null and b/spellchecker/resources/pt.json.gz differ diff --git a/spellchecker/spellchecker.py b/spellchecker/spellchecker.py index 899f196..1f5184c 100644 --- a/spellchecker/spellchecker.py +++ b/spellchecker/spellchecker.py @@ -17,11 +17,13 @@ class SpellChecker(object): Args: language (str): The language of the dictionary to load or None \ - for no dictionary. Supported languages are `en`, `es`, `de`, and \ - `fr`. Defaults to `en` + for no dictionary. Supported languages are `en`, `es`, `de`, `fr` \ + and `pt`. Defaults to `en` local_dictionary (str): The path to a locally stored word \ frequency dictionary; if provided, no language will be loaded - distance (int): The edit distance to use. Defaults to 2''' + distance (int): The edit distance to use. 
Defaults to 2 ''' + + __slots__ = ['_distance', '_word_frequency'] def __init__(self, language='en', local_dictionary=None, distance=2): self._distance = None @@ -77,7 +79,7 @@ def distance(self, val): self._distance = tmp @staticmethod - def words(text): + def split_words(text): ''' Split text into individual `words` using a simple whitespace regex Args: @@ -86,18 +88,19 @@ def words(text): list(str): A listing of all words in the provided text ''' return _words(text) - def export(self, filepath, gzipped=True): + def export(self, filepath, encoding='utf-8', gzipped=True): ''' Export the word frequency list for import in the future Args: filepath (str): The filepath to the exported dictionary + encoding (str): The encoding of the resulting output gzipped (bool): Whether to gzip the dictionary or not ''' data = json.dumps(self.word_frequency.dictionary, sort_keys=True) if gzipped: - with gzip.open(filepath, 'wt') as fobj: + with gzip.open(filepath, 'wt', encoding=encoding) as fobj: fobj.write(data) else: - with open(filepath, 'w') as fobj: + with open(filepath, 'w', encoding=encoding) as fobj: fobj.write(data) def word_probability(self, word, total_words=None): @@ -210,6 +213,7 @@ class WordFrequency(object): ''' Store the `dictionary` as a word frequency list while allowing for different methods to load the data and update over time ''' + __slots__ = ['_dictionary', '_total_words', '_unique_words', '_letters'] def __init__(self): self._dictionary = Counter() self._total_words = 0 @@ -278,27 +282,29 @@ def words(self): for word in self._dictionary.keys(): yield word - def load_dictionary(self, filename): + def load_dictionary(self, filename, encoding='utf-8'): ''' Load in a pre-built word frequency list Args: filename (str): The filepath to the json (optionally gzipped) \ - file to be loaded ''' + file to be loaded + encoding (str): The encoding of the dictionary ''' try: - with gzip.open(filename, 'rt') as fobj: + with gzip.open(filename, mode='rt', 
encoding=encoding) as fobj: data = fobj.read().lower() except OSError: - with open(filename, 'r') as fobj: + with open(filename, mode='r', encoding=encoding) as fobj: data = fobj.read().lower() - self._dictionary.update(json.loads(data, encoding='utf8')) + self._dictionary.update(json.loads(data, encoding=encoding)) self._update_dictionary() - def load_text_file(self, filename): + def load_text_file(self, filename, encoding='utf-8'): ''' Load in a text file from which to generate a word frequency list Args: - filename (str): The filepath to the text file to be loaded ''' - with open(filename, 'r') as fobj: + filename (str): The filepath to the text file to be loaded + encoding (str): The encoding of the text file ''' + with open(filename, 'r', encoding=encoding) as fobj: self.load_text(fobj.read()) def load_text(self, text): diff --git a/tests/spellchecker_test.py b/tests/spellchecker_test.py index 5bac92f..823626a 100644 --- a/tests/spellchecker_test.py +++ b/tests/spellchecker_test.py @@ -36,7 +36,7 @@ def test_words(self): ''' rest the parsing of words ''' spell = SpellChecker() res = ['this', 'is', 'a', 'test', 'of', 'this'] - self.assertEqual(spell.words('This is a test of this'), res) + self.assertEqual(spell.split_words('This is a test of this'), res) def test_word_frequency(self): ''' test word frequency '''