fix spellchecking punctuation and numbers

barrust · Jul 9, 2018 · c12bd4d · c12bd4d
1 parent 74fbc6b
commit c12bd4d
Show file tree

Hide file tree

Showing 4 changed files with 29 additions and 4 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,8 @@
 # pyspellchecker
 
+## Version 0.1.3
+* Better handle punctuation and numbers as the word to check
+
 ## Version 0.1.1
 * Add support for language dictionaries
     * English, Spanish, French, and German

diff --git a/spellchecker/info.py b/spellchecker/info.py
@@ -5,7 +5,7 @@
 __maintainer__ = 'Tyler Barrus'
 __email__ = 'barrust@gmail.com'
 __license__ = 'MIT'
-__version__ = '0.1.2'
+__version__ = '0.1.3'
 __credits__ = ['Peter Norvig']
 __url__ = 'https://github.com/barrust/pyspellchecker'
 __bugtrack_url__ = '{0}/issues'.format(__url__)
diff --git a/spellchecker/spellchecker.py b/spellchecker/spellchecker.py
@@ -6,6 +6,7 @@
 import re
 import json
 import gzip
+import string
 from collections import Counter
 
 
@@ -95,7 +96,7 @@ def candidates(self, word):
             Returns:
                 set: The set of words that are possible candidates '''
         return (self.known([word]) or self.known(self.edit_distance_1(word)) or
-                self.known(self.edit_distance_2(word)) or [word])
+                self.known(self.edit_distance_2(word)) or {word})
 
     def known(self, words):
         ''' The subset of `words` that appear in the dictionary of words
@@ -106,7 +107,8 @@ def known(self, words):
             Returns:
                 set: The set of those words from the input that are in the \
                 corpus '''
-        return set(w for w in words if w in self._word_frequency.dictionary)
+        return set(w for w in words if w in self._word_frequency.dictionary or
+                   not self._check_if_should_check(w))
 
     def unknown(self, words):
         ''' The subset of `words` that do not appear in the dictionary
@@ -117,7 +119,8 @@ def unknown(self, words):
             Returns:
                 set: The set of those words from the input that are not in \
                 the corpus '''
-        return set(w for w in words if w not in self._word_frequency.dictionary)
+        tmp = [w for w in words if self._check_if_should_check(w)]
+        return set(w for w in tmp if w not in self._word_frequency.dictionary)
 
     def edit_distance_1(self, word):
         ''' Compute all strings that are one edit away from `word` using only
@@ -128,6 +131,8 @@ def edit_distance_1(self, word):
             Returns:
                 set: The set of strings that are edit distance one from the \
                 provided word '''
+        if self._check_if_should_check(word) is False:
+            return {word}
         letters = self._word_frequency.letters
         splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
         deletes = [L + R[1:] for L, R in splits if R]
@@ -148,6 +153,17 @@ def edit_distance_2(self, word):
         return (e2 for e1 in self.edit_distance_1(word)
                 for e2 in self.edit_distance_1(e1))
 
+    @staticmethod
+    def _check_if_should_check(word):
+        if len(word) == 1 and word in string.punctuation:
+            return False
+        try:  # check if it is a number (int, float, etc)
+            float(word)
+            return False
+        except ValueError:
+            pass
+
+        return True
 
 class WordFrequency(object):
     ''' Store the `dictionary` as a word frequency list while allowing for

diff --git a/tests/spellchecker_test.py b/tests/spellchecker_test.py
@@ -17,6 +17,9 @@ def test_correction(self):
         self.assertEqual(spell.correction('ergo'), 'ergo')
         self.assertEqual(spell.correction('alot'), 'a lot')
         self.assertEqual(spell.correction('this'), 'this')
+        self.assertEqual(spell.correction('-'), '-')
+        self.assertEqual(spell.correction('1213'), '1213')
+        self.assertEqual(spell.correction('1213.9'), '1213.9')
 
     def test_candidates(self):
         ''' test spell checker candidates '''
@@ -28,6 +31,7 @@ def test_candidates(self):
                  'whs', 'ghs', 'rhs', 'this'}
         self.assertEqual(spell.candidates('ths'), cands)
         self.assertEqual(spell.candidates('the'), {'the'})
+        self.assertEqual(spell.candidates('-'), {'-'})
 
     def test_words(self):
         ''' test the parsing of words '''
@@ -56,6 +60,7 @@ def test_word_known(self):
         self.assertEqual(spell.known(['sherlock']), {'sherlock'})
         self.assertEqual(spell.known(['holmes']), {'holmes'})
         self.assertEqual(spell.known(['known']), {'known'})
+        self.assertEqual(spell.known(['-']), {'-'})
 
         self.assertEqual(spell.known(['foobar']), set())
         self.assertEqual(spell.known(['ths']), set())
@@ -68,6 +73,7 @@ def test_unknown_words(self):
         self.assertEqual(spell.unknown(['sherlock']), set())
         self.assertEqual(spell.unknown(['holmes']), set())
         self.assertEqual(spell.unknown(['known']), set())
+        self.assertEqual(spell.unknown(['-']), set())
 
         self.assertEqual(spell.unknown(['foobar']), {'foobar'})
         self.assertEqual(spell.unknown(['ths']), {'ths'})