Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bpo-35883: Py_DecodeLocale() escapes invalid Unicode characters #24843

Merged
merged 1 commit into from
Mar 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 60 additions & 26 deletions Lib/test/test_cmd_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,38 +200,72 @@ def test_undecodable_code(self):
if not stdout.startswith(pattern):
raise AssertionError("%a doesn't start with %a" % (stdout, pattern))

@unittest.skipIf(sys.platform == 'win32',
                 'Windows has a native unicode API')
def test_invalid_utf8_arg(self):
    # bpo-35883: Py_DecodeLocale() must escape b'\xfd\xbf\xbf\xbb\xba\xba'
    # byte sequence with surrogateescape rather than decoding it as the
    # U+7fffbeba character which is outside the [U+0000; U+10ffff] range of
    # Python Unicode characters.
    #
    # Test with default config, in the C locale, in the Python UTF-8 Mode.
    #
    # The child re-encodes sys.argv[1] with os.fsencode() and prints its
    # ascii() representation, so a lossless round-trip prints ascii(arg).
    code = 'import sys, os; s=os.fsencode(sys.argv[1]); print(ascii(s))'

    def run_default(arg):
        # Inherit the environment of the test runner unchanged.
        cmd = [sys.executable, '-c', code, arg]
        return subprocess.run(cmd, stdout=subprocess.PIPE, text=True)

    def run_c_locale(arg):
        # Same command, but with the C locale forced through LC_ALL.
        cmd = [sys.executable, '-c', code, arg]
        env = dict(os.environ)
        env['LC_ALL'] = 'C'
        return subprocess.run(cmd, stdout=subprocess.PIPE,
                              text=True, env=env)

    def run_utf8_mode(arg):
        # Explicitly enable the Python UTF-8 Mode.
        cmd = [sys.executable, '-X', 'utf8', '-c', code, arg]
        return subprocess.run(cmd, stdout=subprocess.PIPE, text=True)

    valid_utf8 = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8')
    # invalid UTF-8 byte sequences with a valid UTF-8 sequence
    # in the middle.
    invalid_utf8 = (
        b'\xff'                      # invalid byte
        b'\xc3\xff'                  # invalid byte sequence
        b'\xc3\xa9'                  # valid utf-8: U+00E9 character
        b'\xed\xa0\x80'              # lone surrogate character (invalid)
        b'\xfd\xbf\xbf\xbb\xba\xba'  # character outside [U+0000; U+10ffff]
    )
    test_args = [valid_utf8, invalid_utf8]

    for run_cmd in (run_default, run_c_locale, run_utf8_mode):
        with self.subTest(run_cmd=run_cmd):
            for arg in test_args:
                proc = run_cmd(arg)
                # Check the exit status first: a child which dies at
                # startup must be reported as such, not only as an
                # stdout mismatch.
                self.assertEqual(proc.returncode, 0,
                                 "child failed for argument %a" % arg)
                self.assertEqual(proc.stdout.rstrip(), ascii(arg))

@unittest.skipUnless((sys.platform == 'darwin' or
support.is_android), 'test specific to Mac OS X and Android')
def test_osx_android_utf8(self):
def check_output(text):
decoded = text.decode('utf-8', 'surrogateescape')
expected = ascii(decoded).encode('ascii') + b'\n'
text = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8')
code = "import sys; print(ascii(sys.argv[1]))"

env = os.environ.copy()
# C locale gives ASCII locale encoding, but Python uses UTF-8
# to parse the command line arguments on Mac OS X and Android.
env['LC_ALL'] = 'C'
decoded = text.decode('utf-8', 'surrogateescape')
expected = ascii(decoded).encode('ascii') + b'\n'

p = subprocess.Popen(
(sys.executable, "-c", "import sys; print(ascii(sys.argv[1]))", text),
stdout=subprocess.PIPE,
env=env)
stdout, stderr = p.communicate()
self.assertEqual(stdout, expected)
self.assertEqual(p.returncode, 0)
env = os.environ.copy()
# C locale gives ASCII locale encoding, but Python uses UTF-8
# to parse the command line arguments on Mac OS X and Android.
env['LC_ALL'] = 'C'

# test valid utf-8
text = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8')
check_output(text)

# test invalid utf-8
text = (
b'\xff' # invalid byte
b'\xc3\xa9' # valid utf-8 character
b'\xc3\xff' # invalid byte sequence
b'\xed\xa0\x80' # lone surrogate character (invalid)
)
check_output(text)
p = subprocess.Popen(
(sys.executable, "-c", code, text),
stdout=subprocess.PIPE,
env=env)
stdout, stderr = p.communicate()
self.assertEqual(stdout, expected)
self.assertEqual(p.returncode, 0)

def test_non_interactive_output_buffering(self):
code = textwrap.dedent("""
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Python no longer fails at startup with a fatal error if a command line
argument contains an invalid Unicode character. The
:c:func:`Py_DecodeLocale` function now escapes byte sequences which would be
decoded as Unicode characters outside the [U+0000; U+10ffff] range.
9 changes: 5 additions & 4 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,8 @@ NOTE: In the interpreter's initialization phase, some globals are currently
extern "C" {
#endif

/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
// Maximum code point of Unicode 6.0: 0x10ffff (1,114,111).
// The value must be the same in fileutils.c.
#define MAX_UNICODE 0x10ffff

#ifdef Py_DEBUG
Expand Down Expand Up @@ -1784,8 +1785,8 @@ find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
*maxchar = ch;
if (*maxchar > MAX_UNICODE) {
PyErr_Format(PyExc_ValueError,
"character U+%x is not in range [U+0000; U+10ffff]",
ch);
"character U+%x is not in range [U+0000; U+%x]",
ch, MAX_UNICODE);
return -1;
}
}
Expand Down Expand Up @@ -14089,7 +14090,7 @@ _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
{
case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
default:
Py_UNREACHABLE();
}
Expand Down
118 changes: 79 additions & 39 deletions Python/fileutils.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,13 @@ extern int winerror_to_errno(int);
int _Py_open_cloexec_works = -1;
#endif

// The value must be the same in unicodeobject.c.
#define MAX_UNICODE 0x10ffff

// mbstowcs() and mbrtowc() errors
static const size_t DECODE_ERROR = ((size_t)-1);
static const size_t INCOMPLETE_CHARACTER = (size_t)-2;


static int
get_surrogateescape(_Py_error_handler errors, int *surrogateescape)
Expand Down Expand Up @@ -82,6 +89,57 @@ _Py_device_encoding(int fd)
#endif
}


// Tell whether a decoded wide character is accepted.
//
// Return 0 if ch is a surrogate or is greater than MAX_UNICODE,
// return 1 otherwise.
static size_t
is_valid_wide_char(wchar_t ch)
{
    // Two reasons to reject a character:
    //
    // * Lone surrogate characters are rejected.
    // * bpo-35883: Characters outside the [U+0000; U+10ffff] range are
    //   rejected. The glibc mbstowcs() UTF-8 decoder does not respect
    //   RFC 3629, it creates characters outside the [U+0000; U+10ffff]
    //   range: https://sourceware.org/bugzilla/show_bug.cgi?id=2373
    if (Py_UNICODE_IS_SURROGATE(ch) || ch > MAX_UNICODE) {
        return 0;
    }
    return 1;
}


// mbstowcs() wrapper which also validates the decoded wide characters.
//
// Return DECODE_ERROR if mbstowcs() returns DECODE_ERROR, or if dest is not
// NULL and one of the first 'count' characters written to dest is rejected
// by is_valid_wide_char(). Otherwise, return the mbstowcs() result.
static size_t
_Py_mbstowcs(wchar_t *dest, const char *src, size_t n)
{
    size_t count = mbstowcs(dest, src, n);
    if (dest == NULL || count == DECODE_ERROR) {
        // No output buffer to inspect, or decoding already failed.
        return count;
    }

    // Scan the decoded characters: dest[0] .. dest[count-1].
    const wchar_t *end = dest + count;
    for (const wchar_t *p = dest; p < end; p++) {
        if (!is_valid_wide_char(*p)) {
            return DECODE_ERROR;
        }
    }
    return count;
}


#ifdef HAVE_MBRTOWC
// mbrtowc() wrapper which also validates the decoded wide character.
//
// pwc must not be NULL. The mbrtowc() results 0, DECODE_ERROR and
// INCOMPLETE_CHARACTER are returned unchanged. For any other result, *pwc is
// checked with is_valid_wide_char(): return DECODE_ERROR if it is rejected,
// otherwise return the mbrtowc() result.
static size_t
_Py_mbrtowc(wchar_t *pwc, const char *str, size_t len, mbstate_t *pmbs)
{
    assert(pwc != NULL);
    size_t nbytes = mbrtowc(pwc, str, len, pmbs);

    // Special results are passed through without looking at *pwc.
    if (nbytes == 0
        || nbytes == DECODE_ERROR
        || nbytes == INCOMPLETE_CHARACTER)
    {
        return nbytes;
    }

    if (is_valid_wide_char(*pwc)) {
        return nbytes;
    }
    return DECODE_ERROR;
}
#endif


#if !defined(_Py_FORCE_UTF8_FS_ENCODING) && !defined(MS_WINDOWS)

#define USE_FORCE_ASCII
Expand Down Expand Up @@ -148,8 +206,8 @@ check_force_ascii(void)
size_t res;

ch = (unsigned char)0xA7;
res = mbstowcs(&wch, (char*)&ch, 1);
if (res != (size_t)-1 && wch == L'\xA7') {
res = _Py_mbstowcs(&wch, (char*)&ch, 1);
if (res != DECODE_ERROR && wch == L'\xA7') {
/* On HP-UX with the C locale or the POSIX locale,
nl_langinfo(CODESET) announces "roman8", whereas mbstowcs() uses
Latin1 encoding in practice. Force ASCII in this case.
Expand Down Expand Up @@ -196,8 +254,8 @@ check_force_ascii(void)

unsigned uch = (unsigned char)i;
ch[0] = (char)uch;
res = mbstowcs(wch, ch, 1);
if (res != (size_t)-1) {
res = _Py_mbstowcs(wch, ch, 1);
if (res != DECODE_ERROR) {
/* decoding a non-ASCII character from the locale encoding succeeded:
the locale encoding is not ASCII, force ASCII */
return 1;
Expand Down Expand Up @@ -387,9 +445,9 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
*/
argsize = strlen(arg);
#else
argsize = mbstowcs(NULL, arg, 0);
argsize = _Py_mbstowcs(NULL, arg, 0);
#endif
if (argsize != (size_t)-1) {
if (argsize != DECODE_ERROR) {
if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
return -1;
}
Expand All @@ -398,21 +456,13 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
return -1;
}

count = mbstowcs(res, arg, argsize + 1);
if (count != (size_t)-1) {
wchar_t *tmp;
/* Only use the result if it contains no
surrogate characters. */
for (tmp = res; *tmp != 0 &&
!Py_UNICODE_IS_SURROGATE(*tmp); tmp++)
;
if (*tmp == 0) {
if (wlen != NULL) {
*wlen = count;
}
*wstr = res;
return 0;
count = _Py_mbstowcs(res, arg, argsize + 1);
if (count != DECODE_ERROR) {
*wstr = res;
if (wlen != NULL) {
*wlen = count;
}
return 0;
}
PyMem_RawFree(res);
}
Expand All @@ -436,46 +486,36 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
out = res;
memset(&mbs, 0, sizeof mbs);
while (argsize) {
size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
size_t converted = _Py_mbrtowc(out, (char*)in, argsize, &mbs);
if (converted == 0) {
/* Reached end of string; null char stored. */
break;
}

if (converted == (size_t)-2) {
if (converted == INCOMPLETE_CHARACTER) {
/* Incomplete character. This should never happen,
since we provide everything that we have -
unless there is a bug in the C library, or I
misunderstood how mbrtowc works. */
goto decode_error;
}

if (converted == (size_t)-1) {
if (converted == DECODE_ERROR) {
if (!surrogateescape) {
goto decode_error;
}

/* Conversion error. Escape as UTF-8b, and start over
in the initial shift state. */
/* Decoding error. Escape as UTF-8b, and start over in the initial
shift state. */
*out++ = 0xdc00 + *in++;
argsize--;
memset(&mbs, 0, sizeof mbs);
continue;
}

if (Py_UNICODE_IS_SURROGATE(*out)) {
if (!surrogateescape) {
goto decode_error;
}
// _Py_mbrtowc() rejects lone surrogate characters
assert(!Py_UNICODE_IS_SURROGATE(*out));

/* Surrogate character. Escape the original
byte sequence with surrogateescape. */
argsize -= converted;
while (converted--) {
*out++ = 0xdc00 + *in++;
}
continue;
}
/* successfully converted some bytes */
in += converted;
argsize -= converted;
Expand Down Expand Up @@ -652,7 +692,7 @@ encode_current_locale(const wchar_t *text, char **str,
else {
converted = wcstombs(NULL, buf, 0);
}
if (converted == (size_t)-1) {
if (converted == DECODE_ERROR) {
goto encode_error;
}
if (bytes != NULL) {
Expand Down Expand Up @@ -1440,7 +1480,7 @@ _Py_wfopen(const wchar_t *path, const wchar_t *mode)
char cmode[10];
size_t r;
r = wcstombs(cmode, mode, 10);
if (r == (size_t)-1 || r >= 10) {
if (r == DECODE_ERROR || r >= 10) {
errno = EINVAL;
return NULL;
}
Expand Down