Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[3.11] gh-94823: Improve coverage in tokenizer.c:valid_utf8 (GH-94856) #96029

Merged
merged 1 commit into from
Aug 16, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions Lib/test/test_source_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,67 @@ def test_crcrcrlf2(self):
out = self.check_script_output(src, br"'\n\n\n'")


class UTF8ValidatorTest(unittest.TestCase):
    """Check that source files containing invalid UTF-8 are rejected."""

    @unittest.skipIf(not sys.platform.startswith("linux"),
                     "Too slow to run on non-Linux platforms")
    def test_invalid_utf8(self):
        # This is a port of test_utf8_decode_invalid_sequences in
        # test_unicode.py to exercise the separate utf8 validator in
        # Parser/tokenizer.c used when reading source files.

        # That file is written using low-level C file I/O, so the only way to
        # test it is to write actual files to disk.

        # Each example is put inside a string at the top of the file so
        # it's an otherwise valid Python source file.
        template = b'"%s"\n'

        with tempfile.TemporaryDirectory() as tmpd:
            fn = os.path.join(tmpd, 'test.py')

            def check(content):
                # Write *content* into the script and require that running
                # it fails.  The subTest records which byte sequence was
                # under test when a failure is reported.
                with self.subTest(content=content):
                    with open(fn, 'wb') as fp:
                        fp.write(template % content)
                    rc, _, _ = script_helper.assert_python_failure(fn)
                    # assert_python_failure() only requires rc != 0.  A
                    # negative return code means the child was killed by a
                    # signal (i.e. it crashed), which must not be mistaken
                    # for the interpreter rejecting the file.
                    self.assertGreater(rc, 0)

            # continuation bytes in a sequence of 2, 3, or 4 bytes
            continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
            # start bytes of a 2-byte sequence equivalent to
            # code points <= 0x7F (overlong encodings)
            invalid_2B_seq_start_bytes = [
                bytes([x]) for x in range(0xC0, 0xC2)
            ]
            # start bytes of a 4-byte sequence equivalent to
            # code points > 0x10FFFF
            invalid_4B_seq_start_bytes = [
                bytes([x]) for x in range(0xF5, 0xF8)
            ]
            # 0xF7 is already part of invalid_4B_seq_start_bytes, so the
            # remaining never-valid lead bytes start at 0xF8.
            invalid_start_bytes = (
                continuation_bytes + invalid_2B_seq_start_bytes +
                invalid_4B_seq_start_bytes +
                [bytes([x]) for x in range(0xF8, 0x100)]
            )

            for byte in invalid_start_bytes:
                check(byte)

            for sb in invalid_2B_seq_start_bytes:
                for cb in continuation_bytes:
                    check(sb + cb)

            # Only a small sample of continuation bytes is combined here to
            # keep the number of interpreter launches manageable.
            for sb in invalid_4B_seq_start_bytes:
                for cb1 in continuation_bytes[:3]:
                    for cb3 in continuation_bytes[:3]:
                        check(sb + cb1 + b'\x80' + cb3)

            # overlong 3-byte sequences (code points < 0x800)
            for cb in [bytes([x]) for x in range(0x80, 0xA0)]:
                check(b'\xE0' + cb + b'\x80')
                check(b'\xE0' + cb + b'\xBF')
            # surrogates
            for cb in [bytes([x]) for x in range(0xA0, 0xC0)]:
                check(b'\xED' + cb + b'\x80')
                check(b'\xED' + cb + b'\xBF')
            # overlong 4-byte sequences (code points < 0x10000)
            for cb in [bytes([x]) for x in range(0x80, 0x90)]:
                check(b'\xF0' + cb + b'\x80\x80')
                check(b'\xF0' + cb + b'\xBF\xBF')
            # 4-byte sequences for code points > 0x10FFFF
            for cb in [bytes([x]) for x in range(0x90, 0xC0)]:
                check(b'\xF4' + cb + b'\x80\x80')
                check(b'\xF4' + cb + b'\xBF\xBF')


class BytesSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):

def check_script_output(self, src, expected):
Expand Down