add Portuguese; encoding; __slots__

barrust · Nov 6, 2018 · 1603837 · 1603837
1 parent 01a5970
commit 1603837
Show file tree

Hide file tree

Showing 7 changed files with 32 additions and 19 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -9,7 +9,7 @@ install:
   - pip install -r requirements/requirements-dev.txt
 
 script:
-    - coverage run --source=spellchecker setup.py test
+  - coverage run --source=spellchecker setup.py test
 
 # commands to run after the tests successfully complete
 after_success:

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # pyspellchecker
 
+## Version 0.2.0
+* Changed `words` function to `split_words` to differentiate it from the `word_frequency.words` function
+* Added ***Portuguese*** dictionary: `pt`
+* Add encoding argument to `gzip.open` and `open` dictionary loading and exporting
+* Use of `__slots__` for class objects
+
 ## Version 0.1.5
 * Remove words based on threshold
 * Add ability to iterate over words (keys) in the dictionary

diff --git a/README.rst b/README.rst
@@ -21,7 +21,8 @@ list. Those words that are found more often in the frequency list are
 **more likely** the correct results.
 
 ``pyspellchecker`` supports multiple languages including English, Spanish,
-German, and French. Dictionaries were generated using the `WordFrequency project <https://github.com/hermitdave/FrequencyWords>`__ on GitHub.
+German, French, and Portuguese. Dictionaries were generated using
+the `WordFrequency project <https://github.com/hermitdave/FrequencyWords>`__ on GitHub.
 
 ``pyspellchecker`` supports **Python 3**. It may work for Python 2.7 but it is not
 guaranteed (especially for Non-English dictionaries)!

diff --git a/spellchecker/info.py b/spellchecker/info.py
@@ -5,7 +5,7 @@
 __maintainer__ = 'Tyler Barrus'
 __email__ = 'barrust@gmail.com'
 __license__ = 'MIT'
-__version__ = '0.1.5'
+__version__ = '0.2.0'
 __credits__ = ['Peter Norvig']
 __url__ = 'https://github.com/barrust/pyspellchecker'
 __bugtrack_url__ = '{0}/issues'.format(__url__)
diff --git a/spellchecker/resources/pt.json.gz b/spellchecker/resources/pt.json.gz
diff --git a/spellchecker/spellchecker.py b/spellchecker/spellchecker.py
@@ -17,11 +17,13 @@ class SpellChecker(object):
 
         Args:
             language (str): The language of the dictionary to load or None \
-            for no dictionary. Supported languages are `en`, `es`, `de`, and \
-            `fr`. Defaults to `en`
+            for no dictionary. Supported languages are `en`, `es`, `de`, `fr`, \
+            and `pt`. Defaults to `en`
             local_dictionary (str): The path to a locally stored word \
             frequency dictionary; if provided, no language will be loaded
-            distance (int): The edit distance to use. Defaults to 2'''
+            distance (int): The edit distance to use. Defaults to 2 '''
+
+    __slots__ = ['_distance', '_word_frequency']
 
     def __init__(self, language='en', local_dictionary=None, distance=2):
         self._distance = None
@@ -77,7 +79,7 @@ def distance(self, val):
         self._distance = tmp
 
     @staticmethod
-    def words(text):
+    def split_words(text):
         ''' Split text into individual `words` using a simple whitespace regex
 
             Args:
@@ -86,18 +88,19 @@ def words(text):
                 list(str): A listing of all words in the provided text '''
         return _words(text)
 
-    def export(self, filepath, gzipped=True):
+    def export(self, filepath, encoding='utf-8', gzipped=True):
         ''' Export the word frequency list for import in the future
 
              Args:
                 filepath (str): The filepath to the exported dictionary
+                encoding (str): The encoding of the resulting output
                 gzipped (bool): Whether to gzip the dictionary or not '''
         data = json.dumps(self.word_frequency.dictionary, sort_keys=True)
         if gzipped:
-            with gzip.open(filepath, 'wt') as fobj:
+            with gzip.open(filepath, 'wt', encoding=encoding) as fobj:
                 fobj.write(data)
         else:
-            with open(filepath, 'w') as fobj:
+            with open(filepath, 'w', encoding=encoding) as fobj:
                 fobj.write(data)
 
     def word_probability(self, word, total_words=None):
@@ -210,6 +213,7 @@ class WordFrequency(object):
     ''' Store the `dictionary` as a word frequency list while allowing for
         different methods to load the data and update over time '''
 
+    __slots__ = ['_dictionary', '_total_words', '_unique_words', '_letters']
     def __init__(self):
         self._dictionary = Counter()
         self._total_words = 0
@@ -278,27 +282,29 @@ def words(self):
         for word in self._dictionary.keys():
             yield word
 
-    def load_dictionary(self, filename):
+    def load_dictionary(self, filename, encoding='utf-8'):
         ''' Load in a pre-built word frequency list
 
             Args:
                 filename (str): The filepath to the json (optionally gzipped) \
-                file to be loaded '''
+                file to be loaded
+                encoding (str): The encoding of the dictionary '''
         try:
-            with gzip.open(filename, 'rt') as fobj:
+            with gzip.open(filename, mode='rt', encoding=encoding) as fobj:
                 data = fobj.read().lower()
         except OSError:
-            with open(filename, 'r') as fobj:
+            with open(filename, mode='r', encoding=encoding) as fobj:
                 data = fobj.read().lower()
-        self._dictionary.update(json.loads(data, encoding='utf8'))
+        self._dictionary.update(json.loads(data, encoding=encoding))
         self._update_dictionary()
 
-    def load_text_file(self, filename):
+    def load_text_file(self, filename, encoding='utf-8'):
         ''' Load in a text file from which to generate a word frequency list
 
             Args:
-                filename (str): The filepath to the text file to be loaded '''
-        with open(filename, 'r') as fobj:
+                filename (str): The filepath to the text file to be loaded
+                encoding (str): The encoding of the text file '''
+        with open(filename, 'r', encoding=encoding) as fobj:
             self.load_text(fobj.read())
 
     def load_text(self, text):

diff --git a/tests/spellchecker_test.py b/tests/spellchecker_test.py
@@ -36,7 +36,7 @@ def test_words(self):
         ''' test the parsing of words '''
         spell = SpellChecker()
         res = ['this', 'is', 'a', 'test', 'of', 'this']
-        self.assertEqual(spell.words('This is a test of this'), res)
+        self.assertEqual(spell.split_words('This is a test of this'), res)
 
     def test_word_frequency(self):
         ''' test word frequency '''