From c12bd4d3c13896d4fc86e3274301278d522ee633 Mon Sep 17 00:00:00 2001 From: Tyler Barrus Date: Mon, 9 Jul 2018 19:18:44 -0400 Subject: [PATCH] fix spellchecking punctuation and numbers --- CHANGELOG.md | 3 +++ spellchecker/info.py | 2 +- spellchecker/spellchecker.py | 22 +++++++++++++++++++--- tests/spellchecker_test.py | 6 ++++++ 4 files changed, 29 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fef005c..8bac588 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # pyspellchecker +## Version 0.1.3 +* Better handle punctuation and numbers as the word to check + ## Version 0.1.1 * Add support for language dictionaries * English, Spanish, French, and German diff --git a/spellchecker/info.py b/spellchecker/info.py index 6cf9579..2ebc19d 100644 --- a/spellchecker/info.py +++ b/spellchecker/info.py @@ -5,7 +5,7 @@ __maintainer__ = 'Tyler Barrus' __email__ = 'barrust@gmail.com' __license__ = 'MIT' -__version__ = '0.1.2' +__version__ = '0.1.3' __credits__ = ['Peter Norvig'] __url__ = 'https://github.com/barrust/pyspellchecker' __bugtrack_url__ = '{0}/issues'.format(__url__) diff --git a/spellchecker/spellchecker.py b/spellchecker/spellchecker.py index 30ab425..e10a734 100644 --- a/spellchecker/spellchecker.py +++ b/spellchecker/spellchecker.py @@ -6,6 +6,7 @@ import re import json import gzip +import string from collections import Counter @@ -95,7 +96,7 @@ def candidates(self, word): Returns: set: The set of words that are possible candidates ''' return (self.known([word]) or self.known(self.edit_distance_1(word)) or - self.known(self.edit_distance_2(word)) or [word]) + self.known(self.edit_distance_2(word)) or {word}) def known(self, words): ''' The subset of `words` that appear in the dictionary of words @@ -106,7 +107,8 @@ def known(self, words): Returns: set: The set of those words from the input that are in the \ corpus ''' - return set(w for w in words if w in self._word_frequency.dictionary) + return set(w for w in words if w in self._word_frequency.dictionary or + not self._check_if_should_check(w)) def unknown(self, words): ''' The subset of `words` that do not appear in the dictionary @@ -117,7 +119,8 @@ def unknown(self, words): Returns: set: The set of those words from the input that are not in \ the corpus ''' - return set(w for w in words if w not in self._word_frequency.dictionary) + tmp = [w for w in words if self._check_if_should_check(w)] + return set(w for w in tmp if w not in self._word_frequency.dictionary) def edit_distance_1(self, word): ''' Compute all strings that are one edit away from `word` using only @@ -128,6 +131,8 @@ def edit_distance_1(self, word): Returns: set: The set of strings that are edit distance two from the \ provided word ''' + if self._check_if_should_check(word) is False: + return {word} letters = self._word_frequency.letters splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] deletes = [L + R[1:] for L, R in splits if R] @@ -148,6 +153,17 @@ def edit_distance_2(self, word): return (e2 for e1 in self.edit_distance_1(word) for e2 in self.edit_distance_1(e1)) + @staticmethod + def _check_if_should_check(word): + if len(word) == 1 and word in string.punctuation: + return False + try: # check if it is a number (int, float, etc) + float(word) + return False + except ValueError: + pass + + return True class WordFrequency(object): ''' Store the `dictionary` as a word frequency list while allowing for diff --git a/tests/spellchecker_test.py b/tests/spellchecker_test.py index f1b7742..c5b77f1 100644 --- a/tests/spellchecker_test.py +++ b/tests/spellchecker_test.py @@ -17,6 +17,9 @@ def test_correction(self): self.assertEqual(spell.correction('ergo'), 'ergo') self.assertEqual(spell.correction('alot'), 'a lot') self.assertEqual(spell.correction('this'), 'this') + self.assertEqual(spell.correction('-'), '-') + self.assertEqual(spell.correction('1213'), '1213') + self.assertEqual(spell.correction('1213.9'), '1213.9') def test_candidates(self): ''' test spell checker candidates ''' @@ -28,6 +31,7 @@ def test_candidates(self): 'whs', 'ghs', 'rhs', 'this'} self.assertEqual(spell.candidates('ths'), cands) self.assertEqual(spell.candidates('the'), {'the'}) + self.assertEqual(spell.candidates('-'), {'-'}) def test_words(self): ''' rest the parsing of words ''' @@ -56,6 +60,7 @@ def test_word_known(self): self.assertEqual(spell.known(['sherlock']), {'sherlock'}) self.assertEqual(spell.known(['holmes']), {'holmes'}) self.assertEqual(spell.known(['known']), {'known'}) + self.assertEqual(spell.known(['-']), {'-'}) self.assertEqual(spell.known(['foobar']), set()) self.assertEqual(spell.known(['ths']), set()) @@ -68,6 +73,7 @@ def test_unknown_words(self): self.assertEqual(spell.unknown(['sherlock']), set()) self.assertEqual(spell.unknown(['holmes']), set()) self.assertEqual(spell.unknown(['known']), set()) + self.assertEqual(spell.unknown(['-']), set()) self.assertEqual(spell.unknown(['foobar']), {'foobar'}) self.assertEqual(spell.unknown(['ths']), {'ths'})