Skip to content

Commit f215d7c

Browse files
authored
gh-94823: Improve coverage in tokenizer.c:valid_utf8 (GH-94856)
When loading a source file from disk, there is a separate UTF-8 validator distinct from the one in `unicode_decode_utf8`. This exercises that code path with the same set of invalid inputs as we use for testing the "other" UTF-8 decoder.
1 parent 9d51599 commit f215d7c

File tree

1 file changed

+61
-0
lines changed

1 file changed

+61
-0
lines changed

Lib/test/test_source_encoding.py

+61
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,67 @@ def test_crcrcrlf2(self):
224224
out = self.check_script_output(src, br"'\n\n\n'")
225225

226226

227+
class UTF8ValidatorTest(unittest.TestCase):
228+
@unittest.skipIf(not sys.platform.startswith("linux"),
229+
"Too slow to run on non-Linux platforms")
230+
def test_invalid_utf8(self):
231+
# This is a port of test_utf8_decode_invalid_sequences in
232+
# test_unicode.py to exercise the separate utf8 validator in
233+
# Parser/tokenizer.c used when reading source files.
234+
235+
# That file is written using low-level C file I/O, so the only way to
236+
# test it is to write actual files to disk.
237+
238+
# Each example is put inside a string at the top of the file so
239+
# it's an otherwise valid Python source file.
240+
template = b'"%s"\n'
241+
242+
with tempfile.TemporaryDirectory() as tmpd:
243+
fn = os.path.join(tmpd, 'test.py')
244+
245+
def check(content):
246+
with open(fn, 'wb') as fp:
247+
fp.write(template % content)
248+
script_helper.assert_python_failure(fn)
249+
250+
# continuation bytes in a sequence of 2, 3, or 4 bytes
251+
continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
252+
# start bytes of a 2-byte sequence equivalent to code points < 0x7F
253+
invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)]
254+
# start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF
255+
invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)]
256+
invalid_start_bytes = (
257+
continuation_bytes + invalid_2B_seq_start_bytes +
258+
invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)]
259+
)
260+
261+
for byte in invalid_start_bytes:
262+
check(byte)
263+
264+
for sb in invalid_2B_seq_start_bytes:
265+
for cb in continuation_bytes:
266+
check(sb + cb)
267+
268+
for sb in invalid_4B_seq_start_bytes:
269+
for cb1 in continuation_bytes[:3]:
270+
for cb3 in continuation_bytes[:3]:
271+
check(sb+cb1+b'\x80'+cb3)
272+
273+
for cb in [bytes([x]) for x in range(0x80, 0xA0)]:
274+
check(b'\xE0'+cb+b'\x80')
275+
check(b'\xE0'+cb+b'\xBF')
276+
# surrogates
277+
for cb in [bytes([x]) for x in range(0xA0, 0xC0)]:
278+
check(b'\xED'+cb+b'\x80')
279+
check(b'\xED'+cb+b'\xBF')
280+
for cb in [bytes([x]) for x in range(0x80, 0x90)]:
281+
check(b'\xF0'+cb+b'\x80\x80')
282+
check(b'\xF0'+cb+b'\xBF\xBF')
283+
for cb in [bytes([x]) for x in range(0x90, 0xC0)]:
284+
check(b'\xF4'+cb+b'\x80\x80')
285+
check(b'\xF4'+cb+b'\xBF\xBF')
286+
287+
227288
class BytesSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):
228289

229290
def check_script_output(self, src, expected):

0 commit comments

Comments
 (0)