gh-89973: Fix re.error in the fnmatch module. (GH-93072)

serhiy-storchaka · web-flow · commit 0902c3d8edf7 · 2022-06-05T11:46:29.000+03:00
Character ranges with upper bound less than lower bound (e.g. [c-a])
are now interpreted as empty ranges, for compatibility with other glob
pattern implementations. Previously such patterns raised re.error.
diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py
@@ -102,7 +102,7 @@ def translate(pat):
                 add('\\[')
             else:
                 stuff = pat[i:j]
-                if '--' not in stuff:
+                if '-' not in stuff:
                     stuff = stuff.replace('\\', r'\\')
                 else:
                     chunks = []
@@ -114,19 +114,35 @@ def translate(pat):
                         chunks.append(pat[i:k])
                         i = k+1
                         k = k+3
-                    chunks.append(pat[i:j])
+                    chunk = pat[i:j]
+                    if chunk:
+                        chunks.append(chunk)
+                    else:
+                        chunks[-1] += '-'
+                    # Remove empty ranges -- invalid in RE.
+                    for k in range(len(chunks)-1, 0, -1):
+                        if chunks[k-1][-1] > chunks[k][0]:
+                            chunks[k-1] = chunks[k-1][:-1] + chunks[k][1:]
+                            del chunks[k]
                     # Escape backslashes and hyphens for set difference (--).
                     # Hyphens that create ranges shouldn't be escaped.
                     stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-')
                                      for s in chunks)
                 # Escape set operations (&&, ~~ and ||).
                 stuff = re.sub(r'([&~|])', r'\\\1', stuff)
                 i = j+1
-                if stuff[0] == '!':
-                    stuff = '^' + stuff[1:]
-                elif stuff[0] in ('^', '['):
-                    stuff = '\\' + stuff
-                add(f'[{stuff}]')
+                if not stuff:
+                    # Empty range: never match.
+                    add('(?!)')
+                elif stuff == '!':
+                    # Negated empty range: match any character.
+                    add('.')
+                else:
+                    if stuff[0] == '!':
+                        stuff = '^' + stuff[1:]
+                    elif stuff[0] in ('^', '['):
+                        stuff = '\\' + stuff
+                    add(f'[{stuff}]')
         else:
             add(re.escape(c))
     assert i == n
diff --git a/Lib/test/test_fnmatch.py b/Lib/test/test_fnmatch.py
@@ -2,6 +2,7 @@
 
 import unittest
 import os
+import string
 import warnings
 
 from fnmatch import fnmatch, fnmatchcase, translate, filter
@@ -91,6 +92,119 @@ def test_sep(self):
         check('usr/bin', 'usr\\bin', normsep)
         check('usr\\bin', 'usr\\bin')
 
+    def test_char_set(self):
+        ignorecase = os.path.normcase('ABC') == os.path.normcase('abc')
+        check = self.check_match
+        tescases = string.ascii_lowercase + string.digits + string.punctuation
+        for c in tescases:
+            check(c, '[az]', c in 'az')
+            check(c, '[!az]', c not in 'az')
+        # Case insensitive.
+        for c in tescases:
+            check(c, '[AZ]', (c in 'az') and ignorecase)
+            check(c, '[!AZ]', (c not in 'az') or not ignorecase)
+        for c in string.ascii_uppercase:
+            check(c, '[az]', (c in 'AZ') and ignorecase)
+            check(c, '[!az]', (c not in 'AZ') or not ignorecase)
+        # Repeated same character.
+        for c in tescases:
+            check(c, '[aa]', c == 'a')
+        # Special cases.
+        for c in tescases:
+            check(c, '[^az]', c in '^az')
+            check(c, '[[az]', c in '[az')
+            check(c, r'[!]]', c != ']')
+        check('[', '[')
+        check('[]', '[]')
+        check('[!', '[!')
+        check('[!]', '[!]')
+
+    def test_range(self):
+        ignorecase = os.path.normcase('ABC') == os.path.normcase('abc')
+        normsep = os.path.normcase('\\') == os.path.normcase('/')
+        check = self.check_match
+        tescases = string.ascii_lowercase + string.digits + string.punctuation
+        for c in tescases:
+            check(c, '[b-d]', c in 'bcd')
+            check(c, '[!b-d]', c not in 'bcd')
+            check(c, '[b-dx-z]', c in 'bcdxyz')
+            check(c, '[!b-dx-z]', c not in 'bcdxyz')
+        # Case insensitive.
+        for c in tescases:
+            check(c, '[B-D]', (c in 'bcd') and ignorecase)
+            check(c, '[!B-D]', (c not in 'bcd') or not ignorecase)
+        for c in string.ascii_uppercase:
+            check(c, '[b-d]', (c in 'BCD') and ignorecase)
+            check(c, '[!b-d]', (c not in 'BCD') or not ignorecase)
+        # Upper bound == lower bound.
+        for c in tescases:
+            check(c, '[b-b]', c == 'b')
+        # Special cases.
+        for c in tescases:
+            check(c, '[!-#]', c not in '-#')
+            check(c, '[!--.]', c not in '-.')
+            check(c, '[^-`]', c in '^_`')
+            if not (normsep and c == '/'):
+                check(c, '[[-^]', c in r'[\]^')
+                check(c, r'[\-^]', c in r'\]^')
+            check(c, '[b-]', c in '-b')
+            check(c, '[!b-]', c not in '-b')
+            check(c, '[-b]', c in '-b')
+            check(c, '[!-b]', c not in '-b')
+            check(c, '[-]', c in '-')
+            check(c, '[!-]', c not in '-')
+        # Upper bound is less than lower bound: error in RE.
+        for c in tescases:
+            check(c, '[d-b]', False)
+            check(c, '[!d-b]', True)
+            check(c, '[d-bx-z]', c in 'xyz')
+            check(c, '[!d-bx-z]', c not in 'xyz')
+            check(c, '[d-b^-`]', c in '^_`')
+            if not (normsep and c == '/'):
+                check(c, '[d-b[-^]', c in r'[\]^')
+
+    def test_sep_in_char_set(self):
+        normsep = os.path.normcase('\\') == os.path.normcase('/')
+        check = self.check_match
+        check('/', r'[/]')
+        check('\\', r'[\]')
+        check('/', r'[\]', normsep)
+        check('\\', r'[/]', normsep)
+        check('[/]', r'[/]', False)
+        check(r'[\\]', r'[/]', False)
+        check('\\', r'[\t]')
+        check('/', r'[\t]', normsep)
+        check('t', r'[\t]')
+        check('\t', r'[\t]', False)
+
+    def test_sep_in_range(self):
+        normsep = os.path.normcase('\\') == os.path.normcase('/')
+        check = self.check_match
+        check('a/b', 'a[.-0]b', not normsep)
+        check('a\\b', 'a[.-0]b', False)
+        check('a\\b', 'a[Z-^]b', not normsep)
+        check('a/b', 'a[Z-^]b', False)
+
+        check('a/b', 'a[/-0]b', not normsep)
+        check(r'a\b', 'a[/-0]b', False)
+        check('a[/-0]b', 'a[/-0]b', False)
+        check(r'a[\-0]b', 'a[/-0]b', False)
+
+        check('a/b', 'a[.-/]b')
+        check(r'a\b', 'a[.-/]b', normsep)
+        check('a[.-/]b', 'a[.-/]b', False)
+        check(r'a[.-\]b', 'a[.-/]b', False)
+
+        check(r'a\b', r'a[\-^]b')
+        check('a/b', r'a[\-^]b', normsep)
+        check(r'a[\-^]b', r'a[\-^]b', False)
+        check('a[/-^]b', r'a[\-^]b', False)
+
+        check(r'a\b', r'a[Z-\]b', not normsep)
+        check('a/b', r'a[Z-\]b', False)
+        check(r'a[Z-\]b', r'a[Z-\]b', False)
+        check('a[Z-/]b', r'a[Z-\]b', False)
+
     def test_warnings(self):
         with warnings.catch_warnings():
             warnings.simplefilter('error', Warning)
diff --git a/Misc/NEWS.d/next/Library/2022-05-22-16-08-01.gh-issue-89973.jc-Q4g.rst b/Misc/NEWS.d/next/Library/2022-05-22-16-08-01.gh-issue-89973.jc-Q4g.rst
@@ -0,0 +1,3 @@
+Fix :exc:`re.error` raised in :mod:`fnmatch` if the pattern contains a
+character range with upper bound lower than lower bound (e.g. ``[c-a]``).
+Now such ranges are interpreted as empty ranges.

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	+Fix :exc:`re.error` raised in :mod:`fnmatch` if the pattern contains a
	`2`	+character range with upper bound lower than lower bound (e.g. ``[c-a]``).
	`3`	`+Now such ranges are interpreted as empty ranges.`