Skip to content

Commit

Permalink
Merge pull request #66 from pdahale95/issue54
Browse files Browse the repository at this point in the history
add option to split words by space and treat any term with digit as an acronym
  • Loading branch information
mammothb authored Dec 16, 2019
2 parents 33305ee + e135881 commit b910712
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 9 deletions.
21 changes: 17 additions & 4 deletions symspellpy/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def try_parse_int64(string):
return None
return None if ret < -2 ** 64 or ret >= 2 ** 64 else ret

def parse_words(phrase, preserve_case=False):
def parse_words(phrase, preserve_case=False, split_by_space=False):
"""Create a non-unique wordlist from sample text. Language
independent (e.g. works with Chinese characters)
Expand All @@ -121,34 +121,47 @@ def parse_words(phrase, preserve_case=False):
preserve_case : bool, optional
A flag to determine if we want to preserve the cases or convert
all to lowercase
split_by_space : bool, optional
Splits the phrase into words on whitespace only (``str.split()``)
instead of using the word-matching regular expression
Returns
list
A list of words
"""

if split_by_space:
if preserve_case:
return phrase.split()
else:
return phrase.lower().split()

# \W non-words, use negated set to ignore non-words and "_"
# (underscore). Compatible with non-latin characters, does not
# split words at apostrophes

if preserve_case:
return re.findall(r"([^\W_]+['’]*[^\W_]*)", phrase)
else:
return re.findall(r"([^\W_]+['’]*[^\W_]*)", phrase.lower())

def is_acronym(word):
def is_acronym(word, match_any_term_with_digits=False):
"""Checks if the word is all caps (acronym) and/or contains numbers
Parameters
----------
word : str
The word to check
match_any_term_with_digits : bool, optional
A flag to determine whether any term with digits
can be considered as acronym. When True, the all-caps pattern
is not consulted at all and the result is True exactly when
`word` contains at least one digit character.
Returns
-------
bool
True if the word is all caps and/or contain numbers, e.g.,
ABCDE, AB12C. False if the word contains lower case letters,
e.g., abcde, ABCde, abcDE, abCDe, abc12, ab12c
(these examples describe the default,
match_any_term_with_digits=False; with the flag set, abc12 and
ab12c return True because they contain digits)
"""
if match_any_term_with_digits:
# Digit-only test: letter case and word length are ignored here
return any(i.isdigit() for i in word)
# re.match anchors only at the start of `word`: the pattern needs a
# leading run of at least two ASCII upper-case letters/digits that
# ends at a word boundary, so text after a non-word character
# (e.g. the "-cd" in "AB-cd") is not inspected
return re.match(r"\b[A-Z0-9]{2,}\b", word) is not None

def transfer_casing_for_matching_text(text_w_casing, text_wo_casing):
Expand Down
15 changes: 10 additions & 5 deletions symspellpy/symspellpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -715,8 +715,8 @@ def early_exit():
return suggestions

def lookup_compound(self, phrase, max_edit_distance,
ignore_non_words=False,
transfer_casing=False):
ignore_non_words=False, transfer_casing=False,
split_phrase_by_space=False, ignore_any_term_with_digits=False):
"""`lookup_compound` supports compound aware automatic spelling
correction of multi-word input strings with three cases:
Expand All @@ -743,6 +743,11 @@ def lookup_compound(self, phrase, max_edit_distance,
transfer_casing : bool, optional
A flag to determine whether the casing --- i.e., uppercase
vs lowercase --- should be carried over from `phrase`.
split_phrase_by_space : bool, optional
Splits the phrase into words simply based on space
ignore_any_term_with_digits: bool, optional
A flag to determine whether any term with digits
is left alone during the spell checking process
Returns
-------
Expand All @@ -751,11 +756,11 @@ def lookup_compound(self, phrase, max_edit_distance,
representing suggested correct spellings for `phrase`.
"""
# Parse input string into single terms
term_list_1 = helpers.parse_words(phrase)
term_list_1 = helpers.parse_words(phrase, split_by_space = split_phrase_by_space)
# Second list of single terms with preserved cases so we can
# ignore acronyms (all cap words)
if ignore_non_words:
term_list_2 = helpers.parse_words(phrase, True)
term_list_2 = helpers.parse_words(phrase, preserve_case = True, split_by_space = split_phrase_by_space)
suggestions = list()
suggestion_parts = list()
distance_comparer = EditDistance(self._distance_algorithm)
Expand All @@ -768,7 +773,7 @@ def lookup_compound(self, phrase, max_edit_distance,
if helpers.try_parse_int64(term_list_1[i]) is not None:
suggestion_parts.append(SuggestItem(term_list_1[i], 0, 0))
continue
if helpers.is_acronym(term_list_2[i]):
if helpers.is_acronym(term_list_2[i], match_any_term_with_digits = ignore_any_term_with_digits):
suggestion_parts.append(SuggestItem(term_list_2[i], 0, 0))
continue
suggestions = self.lookup(term_list_1[i], Verbosity.TOP,
Expand Down
9 changes: 9 additions & 0 deletions test/test_symspellpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -725,6 +725,15 @@ def test_lookup_compound_ignore_non_words(self):
results = sym_spell.lookup_compound(typo, edit_distance_max, True)
self.assertEqual(1, len(results))
self.assertEqual(correction, results[0].term)

typo = ("is the officeon 1st floor oepn 24/7")
correction = ("is the office on 1st floor open 24/7")
results = sym_spell.lookup_compound(typo, edit_distance_max, split_phrase_by_space=True,
ignore_non_words=True, ignore_any_term_with_digits=True)
self.assertEqual(1, len(results))
self.assertEqual(correction, results[0].term)
self.assertEqual(2, results[0].distance)
self.assertEqual(0, results[0].count)

def test_lookup_compound_ignore_non_words_no_bigram(self):
edit_distance_max = 2
Expand Down

0 comments on commit b910712

Please sign in to comment.