Skip to content

Commit 4378648

Browse files
Move _Width_estimate_intervals_v2 to __msvc_format_ucd_tables.hpp (#4446)
Co-authored-by: Stephan T. Lavavej <stl@nuwen.net>
1 parent ddc5a62 commit 4378648

File tree

4 files changed

+67
-231
lines changed

4 files changed

+67
-231
lines changed

stl/inc/__msvc_format_ucd_tables.hpp

+19
Original file line numberDiff line numberDiff line change
@@ -521,6 +521,25 @@ inline constexpr _Unicode_property_data<_Grapheme_Extend_property_values, 363, t
521521
0x1, 0x4, 0x1, 0x2, 0x2e, 0x17, 0x1, 0x3, 0x5, 0x8, 0x7, 0x4, 0x3, 0x37, 0x32, 0x1, 0x1, 0x5, 0xf, 0x7, 0x11,
522522
0x7, 0x2, 0x5, 0x1, 0x7, 0x1, 0x4, 0x4, 0x7, 0x7, 0x60, 0xf0}};
523523

524+
// EastAsianWidth-15.0.0.txt
525+
// Date: 2022-05-24, 17:40:20 GMT [KW, LI]
526+
// Generated by tools/unicode_properties_parse/unicode_properties_data_gen.py; do not edit by hand.
// The array is a flattened list of interval boundaries: for every run of code points that the
// generator classifies as width 2, it emits the first code point of the run followed by
// (last code point of the run + 1). Entries therefore come in (begin, end) pairs.
inline constexpr char32_t _Width_estimate_intervals_v2[] = {0x1100, 0x1160, 0x231a, 0x231c, 0x2329, 0x232b, 0x23e9,
527+
0x23ed, 0x23f0, 0x23f1, 0x23f3, 0x23f4, 0x25fd, 0x25ff, 0x2614, 0x2616, 0x2648, 0x2654, 0x267f, 0x2680, 0x2693,
528+
0x2694, 0x26a1, 0x26a2, 0x26aa, 0x26ac, 0x26bd, 0x26bf, 0x26c4, 0x26c6, 0x26ce, 0x26cf, 0x26d4, 0x26d5, 0x26ea,
529+
0x26eb, 0x26f2, 0x26f4, 0x26f5, 0x26f6, 0x26fa, 0x26fb, 0x26fd, 0x26fe, 0x2705, 0x2706, 0x270a, 0x270c, 0x2728,
530+
0x2729, 0x274c, 0x274d, 0x274e, 0x274f, 0x2753, 0x2756, 0x2757, 0x2758, 0x2795, 0x2798, 0x27b0, 0x27b1, 0x27bf,
531+
0x27c0, 0x2b1b, 0x2b1d, 0x2b50, 0x2b51, 0x2b55, 0x2b56, 0x2e80, 0x2e9a, 0x2e9b, 0x2ef4, 0x2f00, 0x2fd6, 0x2ff0,
532+
0x2ffc, 0x3000, 0x303f, 0x3041, 0x3097, 0x3099, 0x3100, 0x3105, 0x3130, 0x3131, 0x318f, 0x3190, 0x31e4, 0x31f0,
533+
0x321f, 0x3220, 0x3248, 0x3250, 0xa48d, 0xa490, 0xa4c7, 0xa960, 0xa97d, 0xac00, 0xd7a4, 0xf900, 0xfb00, 0xfe10,
534+
0xfe1a, 0xfe30, 0xfe53, 0xfe54, 0xfe67, 0xfe68, 0xfe6c, 0xff01, 0xff61, 0xffe0, 0xffe7, 0x16fe0, 0x16fe5, 0x16ff0,
535+
0x16ff2, 0x17000, 0x187f8, 0x18800, 0x18cd6, 0x18d00, 0x18d09, 0x1aff0, 0x1aff4, 0x1aff5, 0x1affc, 0x1affd, 0x1afff,
536+
0x1b000, 0x1b123, 0x1b132, 0x1b133, 0x1b150, 0x1b153, 0x1b155, 0x1b156, 0x1b164, 0x1b168, 0x1b170, 0x1b2fc, 0x1f004,
537+
0x1f005, 0x1f0cf, 0x1f0d0, 0x1f18e, 0x1f18f, 0x1f191, 0x1f19b, 0x1f200, 0x1f203, 0x1f210, 0x1f23c, 0x1f240, 0x1f249,
538+
0x1f250, 0x1f252, 0x1f260, 0x1f266, 0x1f300, 0x1f650, 0x1f680, 0x1f6c6, 0x1f6cc, 0x1f6cd, 0x1f6d0, 0x1f6d3, 0x1f6d5,
539+
0x1f6d8, 0x1f6dc, 0x1f6e0, 0x1f6eb, 0x1f6ed, 0x1f6f4, 0x1f6fd, 0x1f7e0, 0x1f7ec, 0x1f7f0, 0x1f7f1, 0x1f900, 0x1fa00,
540+
0x1fa70, 0x1fa7d, 0x1fa80, 0x1fa89, 0x1fa90, 0x1fabe, 0x1fabf, 0x1fac6, 0x1face, 0x1fadc, 0x1fae0, 0x1fae9, 0x1faf0,
541+
0x1faf9, 0x20000, 0x2fffe, 0x30000, 0x3fffe};
542+
524543
_STD_END
525544

526545
#pragma pop_macro("new")

stl/inc/format

-24
Original file line numberDiff line numberDiff line change
@@ -1018,30 +1018,6 @@ _NODISCARD constexpr bool _Is_execution_charset_self_synchronizing() {
10181018
#endif // ^^^ EDG workaround ^^^
10191019
}
10201020

1021-
// Generated per N4950 [format.string.std]/13, by tools/unicode_properties_parse/format_width_estimate_intervals.py
1022-
// in the https://github.com/microsoft/stl repository.
1023-
1024-
// EastAsianWidth-15.0.0.txt
1025-
// Date: 2022-05-24, 17:40:20 GMT [KW, LI]
1026-
inline constexpr char32_t _Width_estimate_intervals_v2[] = { //
1027-
0x1100u, 0x1160u, 0x231Au, 0x231Cu, 0x2329u, 0x232Bu, 0x23E9u, 0x23EDu, 0x23F0u, 0x23F1u, 0x23F3u, 0x23F4u, 0x25FDu,
1028-
0x25FFu, 0x2614u, 0x2616u, 0x2648u, 0x2654u, 0x267Fu, 0x2680u, 0x2693u, 0x2694u, 0x26A1u, 0x26A2u, 0x26AAu, 0x26ACu,
1029-
0x26BDu, 0x26BFu, 0x26C4u, 0x26C6u, 0x26CEu, 0x26CFu, 0x26D4u, 0x26D5u, 0x26EAu, 0x26EBu, 0x26F2u, 0x26F4u, 0x26F5u,
1030-
0x26F6u, 0x26FAu, 0x26FBu, 0x26FDu, 0x26FEu, 0x2705u, 0x2706u, 0x270Au, 0x270Cu, 0x2728u, 0x2729u, 0x274Cu, 0x274Du,
1031-
0x274Eu, 0x274Fu, 0x2753u, 0x2756u, 0x2757u, 0x2758u, 0x2795u, 0x2798u, 0x27B0u, 0x27B1u, 0x27BFu, 0x27C0u, 0x2B1Bu,
1032-
0x2B1Du, 0x2B50u, 0x2B51u, 0x2B55u, 0x2B56u, 0x2E80u, 0x2E9Au, 0x2E9Bu, 0x2EF4u, 0x2F00u, 0x2FD6u, 0x2FF0u, 0x2FFCu,
1033-
0x3000u, 0x303Fu, 0x3041u, 0x3097u, 0x3099u, 0x3100u, 0x3105u, 0x3130u, 0x3131u, 0x318Fu, 0x3190u, 0x31E4u, 0x31F0u,
1034-
0x321Fu, 0x3220u, 0x3248u, 0x3250u, 0xA48Du, 0xA490u, 0xA4C7u, 0xA960u, 0xA97Du, 0xAC00u, 0xD7A4u, 0xF900u, 0xFB00u,
1035-
0xFE10u, 0xFE1Au, 0xFE30u, 0xFE53u, 0xFE54u, 0xFE67u, 0xFE68u, 0xFE6Cu, 0xFF01u, 0xFF61u, 0xFFE0u, 0xFFE7u,
1036-
0x16FE0u, 0x16FE5u, 0x16FF0u, 0x16FF2u, 0x17000u, 0x187F8u, 0x18800u, 0x18CD6u, 0x18D00u, 0x18D09u, 0x1AFF0u,
1037-
0x1AFF4u, 0x1AFF5u, 0x1AFFCu, 0x1AFFDu, 0x1AFFFu, 0x1B000u, 0x1B123u, 0x1B132u, 0x1B133u, 0x1B150u, 0x1B153u,
1038-
0x1B155u, 0x1B156u, 0x1B164u, 0x1B168u, 0x1B170u, 0x1B2FCu, 0x1F004u, 0x1F005u, 0x1F0CFu, 0x1F0D0u, 0x1F18Eu,
1039-
0x1F18Fu, 0x1F191u, 0x1F19Bu, 0x1F200u, 0x1F203u, 0x1F210u, 0x1F23Cu, 0x1F240u, 0x1F249u, 0x1F250u, 0x1F252u,
1040-
0x1F260u, 0x1F266u, 0x1F300u, 0x1F650u, 0x1F680u, 0x1F6C6u, 0x1F6CCu, 0x1F6CDu, 0x1F6D0u, 0x1F6D3u, 0x1F6D5u,
1041-
0x1F6D8u, 0x1F6DCu, 0x1F6E0u, 0x1F6EBu, 0x1F6EDu, 0x1F6F4u, 0x1F6FDu, 0x1F7E0u, 0x1F7ECu, 0x1F7F0u, 0x1F7F1u,
1042-
0x1F900u, 0x1FA00u, 0x1FA70u, 0x1FA7Du, 0x1FA80u, 0x1FA89u, 0x1FA90u, 0x1FABEu, 0x1FABFu, 0x1FAC6u, 0x1FACEu,
1043-
0x1FADCu, 0x1FAE0u, 0x1FAE9u, 0x1FAF0u, 0x1FAF9u, 0x20000u, 0x2FFFEu, 0x30000u, 0x3FFFEu};
1044-
10451021
_NODISCARD constexpr int _Unicode_width_estimate(const char32_t _Ch) noexcept {
10461022
// Computes the width estimation for Unicode characters from N4950 [format.string.std]/13
10471023
// The two branches are functionally equivalent; `12` is chosen for performance here.

tools/unicode_properties_parse/format_width_estimate_intervals.py

-201
This file was deleted.

tools/unicode_properties_parse/unicode_properties_data_gen.py

+48-6
Original file line numberDiff line numberDiff line change
@@ -77,10 +77,10 @@ def compact_property_ranges(input: list[PropertyRange]) -> list[PropertyRange]:
7777
}};
7878
"""
7979

80-
INTERVALS_TEMPLATE = """
80+
WIDTH_ESTIMATE_INTERVALS_TEMPLATE = """
8181
{filename}
8282
{timestamp}
83-
inline constexpr char32_t _{prop_name}_ranges[{size}] = {{
83+
inline constexpr char32_t _Width_estimate_intervals_v2[] = {{
8484
{data}
8585
}};
8686
"""
@@ -148,7 +148,6 @@ def compact_property_ranges(input: list[PropertyRange]) -> list[PropertyRange]:
148148
#if _STL_COMPILER_PREPROCESSOR
149149
150150
#include <cstdint>
151-
#include <limits>
152151
#include <xutility>
153152
154153
#pragma pack(push, _CRT_PACKING)
@@ -166,7 +165,7 @@ def compact_property_ranges(input: list[PropertyRange]) -> list[PropertyRange]:
166165
uint16_t _Props_and_size[_NumRanges];
167166
_NODISCARD constexpr _ValueEnum _Get_property_for_codepoint(const uint32_t _Code_point) const noexcept {{
168167
ptrdiff_t _Upper_idx = _STD upper_bound(_Lower_bounds, _STD end(_Lower_bounds), _Code_point) - _Lower_bounds;
169-
constexpr auto _No_value_constant = static_cast<_ValueEnum>((numeric_limits<uint8_t>::max)());
168+
constexpr auto _No_value_constant = static_cast<_ValueEnum>(UINT8_MAX);
170169
if (_Upper_idx == 0) {{
171170
return _No_value_constant;
172171
}}
@@ -274,10 +273,22 @@ def read_file(filename: str) -> list[PropertyRange]:
274273
return filename, timestamp, ranges
275274

276275

276+
def generate_width_estimate_intervals(filename: str, timestamp: str, width_2_ranges: list[PropertyRange]):
    """
    Render the C++ definition of the width-estimate interval table.

    Each entry of width_2_ranges contributes two consecutive values to the table:
    its `lower` bound followed by `upper + 1`. The values are written as lowercase
    hexadecimal literals, joined with commas, and substituted into
    WIDTH_ESTIMATE_INTERVALS_TEMPLATE together with the source data file's name
    and timestamp comment lines.
    """
    # Flatten every range into its (begin, one-past-end) boundary pair, preserving input order.
    boundaries = [
        bound
        for interval in width_2_ranges
        for bound in (interval.lower, interval.upper + 1)
    ]
    hex_literals = ['0x' + format(bound, 'x') for bound in boundaries]

    template = WIDTH_ESTIMATE_INTERVALS_TEMPLATE.lstrip()
    return template.format(filename=filename, timestamp=timestamp, data=",".join(hex_literals))
285+
286+
277287
def generate_data_tables() -> str:
278288
"""
279289
Generate Unicode data for inclusion into <format> from
280-
GraphemeBreakProperty.txt, emoji-data.txt, DerivedGeneralCategory.txt, and DerivedCoreProperties.txt
290+
GraphemeBreakProperty.txt, emoji-data.txt, DerivedGeneralCategory.txt, DerivedCoreProperties.txt,
291+
and EastAsianWidth.txt.
281292
282293
GraphemeBreakProperty.txt can be found at
283294
https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
@@ -291,28 +302,59 @@ def generate_data_tables() -> str:
291302
DerivedCoreProperties.txt can be found at
292303
https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
293304
305+
EastAsianWidth.txt can be found at
306+
https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
307+
294308
All files are expected to be in the same directory as this script.
295309
"""
296310
gbp_filename, gbp_timestamp, gbp_ranges = read_file("GraphemeBreakProperty.txt")
297311
emoji_filename, emoji_timestamp, emoji_ranges = read_file("emoji-data.txt")
298312
cat_filename, cat_timestamp, cat_ranges = read_file("DerivedGeneralCategory.txt")
299313
derived_filename, derived_timestamp, derived_ranges = read_file("DerivedCoreProperties.txt")
314+
eaw_filename, eaw_timestamp, eaw_ranges = read_file("EastAsianWidth.txt")
300315

301316
printable_ranges = compact_property_ranges(sorted([
302317
PropertyRange(x.lower, x.upper, "Yes")
303318
for x in cat_ranges
304319
if x.prop not in ('Cc', 'Cf', 'Cs', 'Co', 'Cn', 'Zl', 'Zp', 'Zs') or chr(x.lower) == ' '
305320
], key=lambda x: x.lower))
306321

322+
# N4971 [format.string.std]/13
# Code point blocks that the C++ Standard lists as having an estimated width of 2, in addition to
# code points whose East_Asian_Width is F or W. The Standard gives both bounds as inclusive, while
# Python's range() excludes its stop value, hence the `+ 1` on every upper bound.
std_wide_ranges = [
    range(0x4DC0, 0x4DFF + 1),
    range(0x1F300, 0x1F5FF + 1),
    range(0x1F900, 0x1F9FF + 1),
]

def has_width_2(prop_range):
    """
    Return True if the given East Asian Width property range should be estimated as width 2.

    prop_range must expose `prop` (the East_Asian_Width value), `lower`, and `upper`
    (both treated as inclusive code point bounds).

    A range qualifies when its property is "F" or "W", or when it lies inside one of the
    blocks in std_wide_ranges. A range that only partially overlaps one of those blocks
    cannot be classified as a whole, so that situation trips an assertion instead of
    silently producing a wrong table.
    """
    if prop_range.prop in ("F", "W"):
        return True

    for std_wide_range in std_wide_ranges:
        if prop_range.lower in std_wide_range:
            # Starts inside the block, so it must end inside the block as well.
            assert prop_range.upper in std_wide_range

            return True
        else:
            # Starts outside the block, so it must not end inside it.
            assert prop_range.upper not in std_wide_range

    return False
342+
343+
width_2_ranges = compact_property_ranges(sorted([
344+
PropertyRange(x.lower, x.upper, "Yes") for x in eaw_ranges if has_width_2(x)
345+
], key=lambda x: x.lower))
346+
307347
gpb_cpp_data = generate_cpp_data(gbp_filename, gbp_timestamp, "Grapheme_Break", gbp_ranges)
308348
emoji_cpp_data = generate_cpp_data(emoji_filename, emoji_timestamp, "Extended_Pictographic", [
309349
x for x in emoji_ranges if x.prop == "Extended_Pictographic"])
310350
# _printable follows a different naming scheme, to indicate that it is a fake Unicode property.
311351
printable_cpp_data = generate_cpp_data(cat_filename, cat_timestamp, "_printable", printable_ranges)
312352
grapheme_extend_cpp_data = generate_cpp_data(derived_filename, derived_timestamp, "Grapheme_Extend", [
313353
x for x in derived_ranges if x.prop == "Grapheme_Extend"])
354+
width_estimate_intervals = generate_width_estimate_intervals(eaw_filename, eaw_timestamp, width_2_ranges)
314355

315-
return "\n".join([gpb_cpp_data, emoji_cpp_data, printable_cpp_data, grapheme_extend_cpp_data])
356+
return "\n".join(
357+
[gpb_cpp_data, emoji_cpp_data, printable_cpp_data, grapheme_extend_cpp_data, width_estimate_intervals])
316358

317359

318360
if __name__ == "__main__":

0 commit comments

Comments
 (0)