Skip to content

Commit 224f2ce

Browse files
authored
Auto merge of #37855 - tbu-:pr_fix_debug_str, r=alexcrichton
Fix `fmt::Debug` for strings, e.g. for Chinese characters. The problem occurred due to lines like ``` 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;; ``` in `UnicodeData.txt`, which the script previously interpreted as two characters, although they represent the whole range. Fixes #34318.
2 parents e5ed0a5 + d0bb7e1 commit 224f2ce

File tree

3 files changed

+276
-105
lines changed

3 files changed

+276
-105
lines changed

src/etc/char_private.py

+44-8
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,16 @@
1111
# except according to those terms.
1212

1313
# This script uses the following Unicode tables:
14-
# - Categories.txt
14+
# - UnicodeData.txt
1515

16+
17+
from collections import namedtuple
18+
import csv
1619
import os
1720
import subprocess
1821

22+
NUM_CODEPOINTS=0x110000
23+
1924
def to_ranges(iter):
2025
current = None
2126
for i in iter:
@@ -28,10 +33,10 @@ def to_ranges(iter):
2833
if current is not None:
2934
yield tuple(current)
3035

31-
def get_escaped(dictionary):
32-
for i in range(0x110000):
33-
if dictionary.get(i, "Cn") in "Cc Cf Cs Co Cn Zl Zp Zs".split() and i != ord(' '):
34-
yield i
36+
def get_escaped(codepoints):
    """Yield the values of the code points selected for escaping.

    `codepoints` is an iterable of objects exposing `.value` (the integer
    code point) and `.class_` (its two-letter category string, or a falsy
    value when no category is known).

    A code point is yielded when its category is one of
    Cc, Cf, Cs, Co, Cn, Zl, Zp or Zs. A missing/empty category is treated
    as "Cn". The ASCII space is never yielded, although its category
    would otherwise match.
    """
    # Build the lookup set once instead of re-splitting the string for
    # every code point; the input can hold over a million entries.
    escaped_classes = frozenset("Cc Cf Cs Co Cn Zl Zp Zs".split())
    space = ord(' ')
    for c in codepoints:
        if c.value != space and (c.class_ or "Cn") in escaped_classes:
            yield c.value
3540

3641
def get_file(f):
3742
try:
@@ -40,10 +45,41 @@ def get_file(f):
4045
subprocess.run(["curl", "-O", f], check=True)
4146
return open(os.path.basename(f))
4247

48+
# A single code point: its integer value and its category string
# (None when the input gives no category for it).
Codepoint = namedtuple('Codepoint', 'value class_')


def get_codepoints(f):
    """Yield a Codepoint for every value from 0 up to NUM_CODEPOINTS - 1.

    `f` is an iterable of `;`-separated lines in the UnicodeData.txt
    layout: field 0 is the code point in hex, field 1 the name and
    field 2 the category. Rows are expected in ascending code point
    order (the gap filling below relies on it).

    A pair of rows whose names end in "First>" and "Last>" stands for a
    whole range: every code point strictly between them is yielded with
    the category of the "First>" row. Code points in any other gap, and
    everything after the final row, are yielded with category None.

    Raises ValueError if a "First>" row is not immediately followed by a
    "Last>" row, including when the input ends right after "First>".
    """
    r = csv.reader(f, delimiter=";")
    # Start below zero so code point 0 is emitted as part of the leading
    # gap when the input does not list it (e.g. for empty input).
    prev_codepoint = -1
    class_first = None
    for row in r:
        if not row:
            # csv yields an empty list for a blank line; nothing to parse.
            continue

        codepoint = int(row[0], 16)
        name = row[1]
        class_ = row[2]

        if class_first is not None and not name.endswith("Last>"):
            raise ValueError("Missing Last after First")

        # Fill the gap since the previous row. Inside a First/Last pair
        # the gap inherits the range's category; otherwise it is None.
        for c in range(prev_codepoint + 1, codepoint):
            yield Codepoint(c, class_first)

        class_first = class_ if name.endswith("First>") else None

        yield Codepoint(codepoint, class_)
        prev_codepoint = codepoint

    if class_first is not None:
        raise ValueError("Missing Last after First")

    # Everything past the last listed code point has no category.
    for c in range(prev_codepoint + 1, NUM_CODEPOINTS):
        yield Codepoint(c, None)
78+
4379
def main():
44-
file = get_file("http://www.unicode.org/notes/tn36/Categories.txt")
80+
file = get_file("http://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
4581

46-
dictionary = {int(line.split()[0], 16): line.split()[1] for line in file}
82+
codepoints = get_codepoints(file)
4783

4884
CUTOFF=0x10000
4985
singletons0 = []
@@ -52,7 +88,7 @@ def main():
5288
normal1 = []
5389
extra = []
5490

55-
for a, b in to_ranges(get_escaped(dictionary)):
91+
for a, b in to_ranges(get_escaped(codepoints)):
5692
if a > 2 * CUTOFF:
5793
extra.append((a, b - a))
5894
elif a == b - 1:

0 commit comments

Comments
 (0)