Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix tokenising when using more than just a-zA-Z #37

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
/MANIFEST
__pycache__/
*.pyc
test.cram.err
20 changes: 10 additions & 10 deletions scspell/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

import argparse
import os
import re
import regex
import sys
import shutil
import uuid
Expand Down Expand Up @@ -78,22 +78,22 @@
# Treat anything alphanumeric as a token of interest, as long as it is not
# immediately preceded by a single backslash. (The string "\ntext" should
# match on "text" rather than "ntext".)
C_ESCAPE_TOKEN_REGEX = re.compile(r'(?<![^\\]\\)\w+')
C_ESCAPE_TOKEN_REGEX = regex.compile(r'(?<![^\\]\\)\w+')

# \ is not a character escape in e.g. LaTeX
TOKEN_REGEX = re.compile(r'\w+')
TOKEN_REGEX = regex.compile(r'\w+')

# Hex digits will be treated as a special case, because they can look
# word-like even though they are actually numeric
HEX_REGEX = re.compile(r'0x[0-9a-fA-F]+')
HEX_REGEX = regex.compile(r'0x[0-9a-fA-F]+')

# We assume that tokens will be split using either underscores,
# digits, or camelCase conventions (or both)
US_REGEX = re.compile(r'[_\d]+')
CAMEL_WORD_REGEX = re.compile(r'([A-Z][a-z]*)')
US_REGEX = regex.compile(r'[_\d]+')
CAMEL_WORD_REGEX = regex.compile(r'([[:upper:]][[:lower:]]*)')

# File-id specifiers take this form
FILE_ID_REGEX = re.compile(r'scspell-id:[ \t]*([a-zA-Z0-9_\-]+)')
FILE_ID_REGEX = regex.compile(r'scspell-id:[ \t]*([a-zA-Z0-9_\-]+)')


class MatchDescriptor(object):
Expand Down Expand Up @@ -384,7 +384,7 @@ def handle_failed_check_interactively(
print("%s:%u: Unmatched '%s' --> {%s}" %
(filename, match_desc.get_line_num(), token,
', '.join([st for st in unmatched_subtokens])))
MATCH_REGEX = re.compile(re.escape(match_desc.get_token()))
MATCH_REGEX = regex.compile(regex.escape(match_desc.get_token()))
while True:
print("""\
(i)gnore, (I)gnore all, (r)eplace, (R)eplace all, (a)dd to dictionary, or
Expand All @@ -405,7 +405,7 @@ def handle_failed_check_interactively(
(Canceled.)\n""")
else:
ignores.add(replacement.lower())
tail = re.sub(
tail = regex.sub(
MATCH_REGEX, replacement, match_desc.get_remainder(),
1 if ch == 'r' else 0)
print()
Expand Down Expand Up @@ -771,7 +771,7 @@ def add_to_dict(dictionary_type, word, files=[],
dicts.add_by_file_id(word, file_id)

elif dictionary_type[0] == 'p':
ext = re.sub(r'.*\.', '.', '.{}'.format(files[0].lower()))
ext = regex.sub(r'.*\.', '.', '.{}'.format(files[0].lower()))
if not dicts.add_by_extension(word, ext):
print("Dictionary for file extension '{}' not found."
.format(ext), file=sys.stderr)
Expand Down
4 changes: 2 additions & 2 deletions scspell/_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
import io
import json
import os
import re
import regex
import sys
from bisect import bisect_left
from . import _util
Expand All @@ -41,7 +41,7 @@


# Valid file ID strings take this form
FILE_ID_REGEX = re.compile(r'[a-zA-Z0-9_\-]+')
FILE_ID_REGEX = regex.compile(r'[a-zA-Z0-9_\-]+')


MATCH_NATURAL = 0x1
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,5 +43,6 @@ def get_version():
'Topic :: Software Development',
'Topic :: Text Processing :: Linguistic',
'Topic :: Utilities'],
platforms=['any']
platforms=['any'],
install_requires=['regex']
)
9 changes: 8 additions & 1 deletion test.cram
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ Test okay file.
$ echo 'This is okay.' > good.txt
$ $SCSPELL good.txt


Test file with --override-dictionary and a fileid mapping entry

$ cp -a "$TESTDIR/tests" .
Expand All @@ -27,6 +26,14 @@ Test file with --override-dictionary and a fileid mapping entry
tests/fileidmap/inputfile2.txt:4: 'soem' not found in dictionary (from token 'soem')
[1]

Test spelling mistake with diacritics.

$ $SCSPELL 'tests/basedicts/unicode-testfile'
tests/basedicts/unicode-testfile:1: 'b\xe4dly' not found in dictionary (from token 'B\xe4dly')
tests/basedicts/unicode-testfile:1: '\xe1lmost' not found in dictionary (from token '\xc1lmost')
tests/basedicts/unicode-testfile:1: '\xe7\xe5m\xe9l', '\xe7\xe4se' were not found in the dictionary (from token '\xc7\xe5m\xe9l\xc7\xe4se')
[1]

Test file ID manipulations

$ $SCSPELL --override-dictionary tests/fileidmap/dictionary \
Expand Down
1 change: 1 addition & 0 deletions tests/basedicts/unicode-testfile
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Bädly Álmost ÇåmélÇäse