Check errors don't exist as valid words in the aspell dictionary (#1142)

* Check errors don't exist as valid words in the aspell dictionary * Install aspell on Travis * Add some missing packages * Remove a virtual package * Just install the version of aspell-python we need * Keep flake8 happy * Switch to warnings and count them, so we can see all the aspell errors in one go * Handle different encoding of the word and aspell * Try and fix the encoding conversion * Find out the encoding type * Don't assert on number of warnings * Don't record warnings for now * Warn on all the encoding options * pprint the encoding * More warning work * Use the actual encoding type * Correct the logic * ENH: Multi dict support * FIX: Fixes after merge * FIX: Better error check * FIX: More thorough testing, locations * FIX: Try newer aspell * FIX: Move to new dict * FIX: Move * FIX: Restore removals from #1181 * FIX: One from #1362 * Add rare chack->check, cheque, * Minor tidy of some dictionary check code * Add some more suggestions. * Fix the whitespace * Really fix the whitespace * FIX: Refactor requirement * Log an error when aspell not found and not required * Fix the error logging * Test all variants of present and missing from aspell * Undo some tuple tidying * Fix the true/false values used * Skip some flake8 tests * Fix the test cases * Correct the not in aspell test and fix some test cases * Remove a duplicate test * Use a test word that isn't a typo * Set the ideal aspell detection logic for each dictionary I suspect we'll have to relax this as more obscure words won't be in the aspell dictionary * Be more realistic given the size of the dictionary * Fix a flake8 error * Fix another line length error * FIX: Move * FIX: Make visible, simplify Co-authored-by: Eric Larson <larson.eric.d@gmail.com>
codespell-project · Apr 6, 2020 · d978da6 · d978da6
1 parent 8d99c0e
commit d978da6
Show file tree

Hide file tree

Showing 11 changed files with 428 additions and 243 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -4,6 +4,7 @@
 # for it to be on multiple physical lines, so long as you remember: - There
 # can't be any leading "-"s - All newlines will be removed, so use ";"s
 
+dist: bionic
 language: python
 cache: pip
 python:
@@ -12,6 +13,14 @@ python:
   - 3.6
   - 3.7
   - 3.8
+env:
+  REQUIRE_ASPELL=true
+
+addons:
+  apt:
+    packages:
+      - libaspell-dev
+      - aspell-en
 
 before_install:
     - source tools/travis_tools.sh
@@ -22,6 +31,8 @@ before_install:
     - python --version  # just to check
     - pip install -U pip wheel  # upgrade to latest pip find 3.5 wheels; wheel to avoid errors
     - retry pip install pytest pytest-cov flake8 coverage codecov chardet setuptools docutils
+    - if [ ${TRAVIS_PYTHON_VERSION:0:1} == "2" ]; then retry pip install aspell-python-py2; fi
+    - if [ ${TRAVIS_PYTHON_VERSION:0:1} == "3" ]; then retry pip install aspell-python-py3; fi
     - cd $SRC_DIR
 
 install:

diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
@@ -35,8 +35,18 @@
 
 # Users might want to link this file into /usr/local/bin, so we resolve the
 # symbolic link path to the real path if necessary.
-default_dictionary = os.path.join(os.path.dirname(os.path.realpath(__file__)),
-                                  'data', 'dictionary.txt')
+_data_root = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data')
+_builtin_dictionaries = (  # name, desc, file suffix, err in aspell, correction in aspell  # noqa: E501
+# The aspell expectations here aren't the ideal state, but the None entries are
+# realistic for obscure words
+    ('clear', 'for unambiguous errors', '', False, None),
+    ('rare', 'for rare but valid words', '_rare', None, None),
+    ('informal', 'for informal words', '_informal', True, True),
+    ('code', 'for words common to code and/or mathematics', '_code', None, None),  # noqa: E501
+    ('names', 'for valid proper names that might be typos', '_names', None, None),  # noqa: E501
+    ('en-GB_to_en-US', 'for corrections from en-GB to en-US', '_en-GB_to_en-US', True, True),  # noqa: E501
+)
+_builtin_default = 'clear,rare'
 
 # OPTIONS:
 #
@@ -216,11 +226,21 @@ def parse_options(args):
                         help='write changes in place if possible')
 
     parser.add_argument('-D', '--dictionary',
-                        action='append', metavar='FILE',
+                        action='append',
                         help='Custom dictionary file that contains spelling '
                              'corrections. If this flag is not specified or '
                              'equals "-" then the default dictionary is used. '
                              'This option can be specified multiple times.')
+    builtin_opts = ', '.join(
+        '%r %s' % (d[0], d[1]) for d in _builtin_dictionaries)
+    parser.add_argument('--builtin',
+                        dest='builtin', default=_builtin_default,
+                        metavar='BUILTIN-LIST',
+                        help='Comma-separated list of builtin dictionaries '
+                        'to include (when "-D -" or no "-D" is passed). '
+                        'Current options are:\n%s. The default is '
+                        '"--builtin %s".'
+                        % (builtin_opts, _builtin_default))
     parser.add_argument('-I', '--ignore-words',
                         action='append', metavar='FILE',
                         help='File that contains words which will be ignored '
@@ -603,7 +623,7 @@ def main(*args):
     ignore_words_files = options.ignore_words or []
     ignore_words = set()
     for ignore_words_file in ignore_words_files:
-        if not os.path.exists(ignore_words_file):
+        if not os.path.isfile(ignore_words_file):
             print('ERROR: cannot find ignore-words file: %s' %
                   ignore_words_file, file=sys.stderr)
             parser.print_help()
@@ -615,16 +635,36 @@ def main(*args):
         for word in comma_separated_words.split(','):
             ignore_words.add(word.strip())
 
-    dictionaries = options.dictionary or [default_dictionary]
-    misspellings = dict()
+    if options.dictionary:
+        dictionaries = options.dictionary
+    else:
+        dictionaries = ['-']
+    use_dictionaries = list()
     for dictionary in dictionaries:
         if dictionary == "-":
-            dictionary = default_dictionary
-        if not os.path.exists(dictionary):
-            print('ERROR: cannot find dictionary file: %s' % dictionary,
-                  file=sys.stderr)
-            parser.print_help()
-            return 1
+            # figure out which builtin dictionaries to use
+            use = sorted(set(options.builtin.split(',')))
+            for u in use:
+                for builtin in _builtin_dictionaries:
+                    if builtin[0] == u:
+                        use_dictionaries.append(
+                            os.path.join(_data_root, 'dictionary%s.txt'
+                                         % (builtin[2],)))
+                        break
+                else:
+                    print('ERROR: Unknown builtin dictionary: %s' % (u,),
+                          file=sys.stderr)
+                    parser.print_help()
+                    return 1
+        else:
+            if not os.path.isfile(dictionary):
+                print('ERROR: cannot find dictionary file: %s' % dictionary,
+                      file=sys.stderr)
+                parser.print_help()
+                return 1
+            use_dictionaries.append(dictionary)
+    misspellings = dict()
+    for dictionary in use_dictionaries:
         build_dict(dictionary, misspellings, ignore_words)
     colors = TermColors()
     if not options.colors or sys.platform == 'win32':