Merge pull request #66 from pdahale95/issue54

add option to split words by space and treat any term with digit as an acronym
mammothb · Dec 16, 2019 · b910712 · b910712
2 parents 33305ee + e135881
commit b910712
Show file tree

Hide file tree

Showing 3 changed files with 36 additions and 9 deletions.
diff --git a/symspellpy/helpers.py b/symspellpy/helpers.py
@@ -110,7 +110,7 @@ def try_parse_int64(string):
         return None
     return None if ret < -2 ** 64 or ret >= 2 ** 64 else ret
 
-def parse_words(phrase, preserve_case=False):
+def parse_words(phrase, preserve_case=False, split_by_space=False):
     """Create a non-unique wordlist from sample text. Language
     independent (e.g. works with Chinese characters)
 
@@ -121,34 +121,47 @@ def parse_words(phrase, preserve_case=False):
     preserve_case : bool, optional
         A flag to determine if we can to preserve the cases or convert
         all to lowercase
-
+    split_by_space : bool, optional
+        If True, split the phrase on whitespace instead of by regex
     Returns
     list
         A list of words
     """
+
+    if split_by_space:
+        if preserve_case:
+            return phrase.split()
+        else:
+            return phrase.lower().split()
+
     # \W non-words, use negated set to ignore non-words and "_"
     # (underscore). Compatible with non-latin characters, does not
     # split words at apostrophes
+
     if preserve_case:
         return re.findall(r"([^\W_]+['’]*[^\W_]*)", phrase)
     else:
         return re.findall(r"([^\W_]+['’]*[^\W_]*)", phrase.lower())
 
-def is_acronym(word):
+def is_acronym(word, match_any_term_with_digits=False):
     """Checks is the word is all caps (acronym) and/or contain numbers
 
     Parameters
     ----------
     word : str
         The word to check
-
+    match_any_term_with_digits : bool, optional
+        If True, any term containing at least one digit is treated
+        as an acronym regardless of letter case, e.g., abc12, 1st
     Returns
     -------
     bool
         True if the word is all caps and/or contain numbers, e.g.,
         ABCDE, AB12C. False if the word contains lower case letters,
         e.g., abcde, ABCde, abcDE, abCDe, abc12, ab12c
     """
+    if match_any_term_with_digits:
+        return any(i.isdigit() for i in word)
     return re.match(r"\b[A-Z0-9]{2,}\b", word) is not None
 
 def transfer_casing_for_matching_text(text_w_casing, text_wo_casing):

diff --git a/symspellpy/symspellpy.py b/symspellpy/symspellpy.py
@@ -715,8 +715,8 @@ def early_exit():
         return suggestions
 
     def lookup_compound(self, phrase, max_edit_distance,
-                        ignore_non_words=False,
-                        transfer_casing=False):
+                        ignore_non_words=False, transfer_casing=False,
+                        split_phrase_by_space=False, ignore_any_term_with_digits=False):
         """`lookup_compound` supports compound aware automatic spelling
         correction of multi-word input strings with three cases:
 
@@ -743,6 +743,11 @@ def lookup_compound(self, phrase, max_edit_distance,
         transfer_casing : bool, optional
             A flag to determine whether the casing --- i.e., uppercase
             vs lowercase --- should be carried over from `phrase`.
+        split_phrase_by_space : bool, optional
+            Splits the phrase into words simply based on space
+        ignore_any_term_with_digits : bool, optional
+            A flag to determine whether any term with digits
+            is left alone during the spell checking process
 
         Returns
         -------
@@ -751,11 +756,11 @@ def lookup_compound(self, phrase, max_edit_distance,
             representing suggested correct spellings for `phrase`.
         """
         # Parse input string into single terms
-        term_list_1 = helpers.parse_words(phrase)
+        term_list_1 = helpers.parse_words(phrase, split_by_space = split_phrase_by_space)
         # Second list of single terms with preserved cases so we can
         # ignore acronyms (all cap words)
         if ignore_non_words:
-            term_list_2 = helpers.parse_words(phrase, True)
+            term_list_2 = helpers.parse_words(phrase, preserve_case = True, split_by_space = split_phrase_by_space)
         suggestions = list()
         suggestion_parts = list()
         distance_comparer = EditDistance(self._distance_algorithm)
@@ -768,7 +773,7 @@ def lookup_compound(self, phrase, max_edit_distance,
                 if helpers.try_parse_int64(term_list_1[i]) is not None:
                     suggestion_parts.append(SuggestItem(term_list_1[i], 0, 0))
                     continue
-                if helpers.is_acronym(term_list_2[i]):
+                if helpers.is_acronym(term_list_2[i], match_any_term_with_digits = ignore_any_term_with_digits):
                     suggestion_parts.append(SuggestItem(term_list_2[i], 0, 0))
                     continue
             suggestions = self.lookup(term_list_1[i], Verbosity.TOP,

diff --git a/test/test_symspellpy.py b/test/test_symspellpy.py
@@ -725,6 +725,15 @@ def test_lookup_compound_ignore_non_words(self):
         results = sym_spell.lookup_compound(typo, edit_distance_max, True)
         self.assertEqual(1, len(results))
         self.assertEqual(correction, results[0].term)
+
+        typo = ("is the officeon 1st floor oepn 24/7")
+        correction = ("is the office on 1st floor open 24/7")
+        results = sym_spell.lookup_compound(typo, edit_distance_max, split_phrase_by_space=True, 
+        ignore_non_words=True, ignore_any_term_with_digits=True)
+        self.assertEqual(1, len(results))
+        self.assertEqual(correction, results[0].term)
+        self.assertEqual(2, results[0].distance)
+        self.assertEqual(0, results[0].count)
 
     def test_lookup_compound_ignore_non_words_no_bigram(self):
         edit_distance_max = 2