From 26f553d0685b8ab2b987efe1d9f8e0f519db129b Mon Sep 17 00:00:00 2001 From: cpplearner Date: Mon, 4 Mar 2024 21:04:35 +0800 Subject: [PATCH 1/3] Move `_Width_estimate_intervals_v2` to `__msvc_format_ucd_tables.hpp` --- stl/inc/__msvc_format_ucd_tables.hpp | 19 +++++++ stl/inc/format | 24 --------- .../grapheme_break_property_data_gen.py | 53 ++++++++++++++++--- 3 files changed, 66 insertions(+), 30 deletions(-) diff --git a/stl/inc/__msvc_format_ucd_tables.hpp b/stl/inc/__msvc_format_ucd_tables.hpp index 5f0d7161b2..6ca81753c3 100644 --- a/stl/inc/__msvc_format_ucd_tables.hpp +++ b/stl/inc/__msvc_format_ucd_tables.hpp @@ -521,6 +521,25 @@ inline constexpr _Unicode_property_data<_Grapheme_Extend_property_values, 363, t 0x1, 0x4, 0x1, 0x2, 0x2e, 0x17, 0x1, 0x3, 0x5, 0x8, 0x7, 0x4, 0x3, 0x37, 0x32, 0x1, 0x1, 0x5, 0xf, 0x7, 0x11, 0x7, 0x2, 0x5, 0x1, 0x7, 0x1, 0x4, 0x4, 0x7, 0x7, 0x60, 0xf0}}; +// EastAsianWidth-15.0.0.txt +// Date: 2022-05-24, 17:40:20 GMT [KW, LI] +inline constexpr char32_t _Width_estimate_intervals_v2[] = {0x1100, 0x1160, 0x231a, 0x231c, 0x2329, 0x232b, 0x23e9, + 0x23ed, 0x23f0, 0x23f1, 0x23f3, 0x23f4, 0x25fd, 0x25ff, 0x2614, 0x2616, 0x2648, 0x2654, 0x267f, 0x2680, 0x2693, + 0x2694, 0x26a1, 0x26a2, 0x26aa, 0x26ac, 0x26bd, 0x26bf, 0x26c4, 0x26c6, 0x26ce, 0x26cf, 0x26d4, 0x26d5, 0x26ea, + 0x26eb, 0x26f2, 0x26f4, 0x26f5, 0x26f6, 0x26fa, 0x26fb, 0x26fd, 0x26fe, 0x2705, 0x2706, 0x270a, 0x270c, 0x2728, + 0x2729, 0x274c, 0x274d, 0x274e, 0x274f, 0x2753, 0x2756, 0x2757, 0x2758, 0x2795, 0x2798, 0x27b0, 0x27b1, 0x27bf, + 0x27c0, 0x2b1b, 0x2b1d, 0x2b50, 0x2b51, 0x2b55, 0x2b56, 0x2e80, 0x2e9a, 0x2e9b, 0x2ef4, 0x2f00, 0x2fd6, 0x2ff0, + 0x2ffc, 0x3000, 0x303f, 0x3041, 0x3097, 0x3099, 0x3100, 0x3105, 0x3130, 0x3131, 0x318f, 0x3190, 0x31e4, 0x31f0, + 0x321f, 0x3220, 0x3248, 0x3250, 0xa48d, 0xa490, 0xa4c7, 0xa960, 0xa97d, 0xac00, 0xd7a4, 0xf900, 0xfb00, 0xfe10, + 0xfe1a, 0xfe30, 0xfe53, 0xfe54, 0xfe67, 0xfe68, 0xfe6c, 0xff01, 0xff61, 0xffe0, 0xffe7, 0x16fe0, 0x16fe5, 
0x16ff0, + 0x16ff2, 0x17000, 0x187f8, 0x18800, 0x18cd6, 0x18d00, 0x18d09, 0x1aff0, 0x1aff4, 0x1aff5, 0x1affc, 0x1affd, 0x1afff, + 0x1b000, 0x1b123, 0x1b132, 0x1b133, 0x1b150, 0x1b153, 0x1b155, 0x1b156, 0x1b164, 0x1b168, 0x1b170, 0x1b2fc, 0x1f004, + 0x1f005, 0x1f0cf, 0x1f0d0, 0x1f18e, 0x1f18f, 0x1f191, 0x1f19b, 0x1f200, 0x1f203, 0x1f210, 0x1f23c, 0x1f240, 0x1f249, + 0x1f250, 0x1f252, 0x1f260, 0x1f266, 0x1f300, 0x1f650, 0x1f680, 0x1f6c6, 0x1f6cc, 0x1f6cd, 0x1f6d0, 0x1f6d3, 0x1f6d5, + 0x1f6d8, 0x1f6dc, 0x1f6e0, 0x1f6eb, 0x1f6ed, 0x1f6f4, 0x1f6fd, 0x1f7e0, 0x1f7ec, 0x1f7f0, 0x1f7f1, 0x1f900, 0x1fa00, + 0x1fa70, 0x1fa7d, 0x1fa80, 0x1fa89, 0x1fa90, 0x1fabe, 0x1fabf, 0x1fac6, 0x1face, 0x1fadc, 0x1fae0, 0x1fae9, 0x1faf0, + 0x1faf9, 0x20000, 0x2fffe, 0x30000, 0x3fffe}; + _STD_END #pragma pop_macro("new") diff --git a/stl/inc/format b/stl/inc/format index f22b4447fd..4f7a3ffa3a 100644 --- a/stl/inc/format +++ b/stl/inc/format @@ -1018,30 +1018,6 @@ _NODISCARD constexpr bool _Is_execution_charset_self_synchronizing() { #endif // ^^^ EDG workaround ^^^ } -// Generated per N4950 [format.string.std]/13, by tools/unicode_properties_parse/format_width_estimate_intervals.py -// in the https://github.com/microsoft/stl repository. 
- -// EastAsianWidth-15.0.0.txt -// Date: 2022-05-24, 17:40:20 GMT [KW, LI] -inline constexpr char32_t _Width_estimate_intervals_v2[] = { // - 0x1100u, 0x1160u, 0x231Au, 0x231Cu, 0x2329u, 0x232Bu, 0x23E9u, 0x23EDu, 0x23F0u, 0x23F1u, 0x23F3u, 0x23F4u, 0x25FDu, - 0x25FFu, 0x2614u, 0x2616u, 0x2648u, 0x2654u, 0x267Fu, 0x2680u, 0x2693u, 0x2694u, 0x26A1u, 0x26A2u, 0x26AAu, 0x26ACu, - 0x26BDu, 0x26BFu, 0x26C4u, 0x26C6u, 0x26CEu, 0x26CFu, 0x26D4u, 0x26D5u, 0x26EAu, 0x26EBu, 0x26F2u, 0x26F4u, 0x26F5u, - 0x26F6u, 0x26FAu, 0x26FBu, 0x26FDu, 0x26FEu, 0x2705u, 0x2706u, 0x270Au, 0x270Cu, 0x2728u, 0x2729u, 0x274Cu, 0x274Du, - 0x274Eu, 0x274Fu, 0x2753u, 0x2756u, 0x2757u, 0x2758u, 0x2795u, 0x2798u, 0x27B0u, 0x27B1u, 0x27BFu, 0x27C0u, 0x2B1Bu, - 0x2B1Du, 0x2B50u, 0x2B51u, 0x2B55u, 0x2B56u, 0x2E80u, 0x2E9Au, 0x2E9Bu, 0x2EF4u, 0x2F00u, 0x2FD6u, 0x2FF0u, 0x2FFCu, - 0x3000u, 0x303Fu, 0x3041u, 0x3097u, 0x3099u, 0x3100u, 0x3105u, 0x3130u, 0x3131u, 0x318Fu, 0x3190u, 0x31E4u, 0x31F0u, - 0x321Fu, 0x3220u, 0x3248u, 0x3250u, 0xA48Du, 0xA490u, 0xA4C7u, 0xA960u, 0xA97Du, 0xAC00u, 0xD7A4u, 0xF900u, 0xFB00u, - 0xFE10u, 0xFE1Au, 0xFE30u, 0xFE53u, 0xFE54u, 0xFE67u, 0xFE68u, 0xFE6Cu, 0xFF01u, 0xFF61u, 0xFFE0u, 0xFFE7u, - 0x16FE0u, 0x16FE5u, 0x16FF0u, 0x16FF2u, 0x17000u, 0x187F8u, 0x18800u, 0x18CD6u, 0x18D00u, 0x18D09u, 0x1AFF0u, - 0x1AFF4u, 0x1AFF5u, 0x1AFFCu, 0x1AFFDu, 0x1AFFFu, 0x1B000u, 0x1B123u, 0x1B132u, 0x1B133u, 0x1B150u, 0x1B153u, - 0x1B155u, 0x1B156u, 0x1B164u, 0x1B168u, 0x1B170u, 0x1B2FCu, 0x1F004u, 0x1F005u, 0x1F0CFu, 0x1F0D0u, 0x1F18Eu, - 0x1F18Fu, 0x1F191u, 0x1F19Bu, 0x1F200u, 0x1F203u, 0x1F210u, 0x1F23Cu, 0x1F240u, 0x1F249u, 0x1F250u, 0x1F252u, - 0x1F260u, 0x1F266u, 0x1F300u, 0x1F650u, 0x1F680u, 0x1F6C6u, 0x1F6CCu, 0x1F6CDu, 0x1F6D0u, 0x1F6D3u, 0x1F6D5u, - 0x1F6D8u, 0x1F6DCu, 0x1F6E0u, 0x1F6EBu, 0x1F6EDu, 0x1F6F4u, 0x1F6FDu, 0x1F7E0u, 0x1F7ECu, 0x1F7F0u, 0x1F7F1u, - 0x1F900u, 0x1FA00u, 0x1FA70u, 0x1FA7Du, 0x1FA80u, 0x1FA89u, 0x1FA90u, 0x1FABEu, 0x1FABFu, 0x1FAC6u, 0x1FACEu, - 0x1FADCu, 
0x1FAE0u, 0x1FAE9u, 0x1FAF0u, 0x1FAF9u, 0x20000u, 0x2FFFEu, 0x30000u, 0x3FFFEu}; - _NODISCARD constexpr int _Unicode_width_estimate(const char32_t _Ch) noexcept { // Computes the width estimation for Unicode characters from N4950 [format.string.std]/13 // The two branches are functionally equivalent; `12` is chosen for performance here. diff --git a/tools/unicode_properties_parse/grapheme_break_property_data_gen.py b/tools/unicode_properties_parse/grapheme_break_property_data_gen.py index d330700495..a3a642f36b 100644 --- a/tools/unicode_properties_parse/grapheme_break_property_data_gen.py +++ b/tools/unicode_properties_parse/grapheme_break_property_data_gen.py @@ -77,10 +77,10 @@ def compact_property_ranges(input: list[PropertyRange]) -> list[PropertyRange]: }}; """ -INTERVALS_TEMPLATE = """ +WIDTH_ESTIMATE_INTERVALS_TEMPLATE = """ {filename} {timestamp} -inline constexpr char32_t _{prop_name}_ranges[{size}] = {{ +inline constexpr char32_t _Width_estimate_intervals_v2[] = {{ {data} }}; """ @@ -148,7 +148,6 @@ def compact_property_ranges(input: list[PropertyRange]) -> list[PropertyRange]: #if _STL_COMPILER_PREPROCESSOR #include -#include #include #pragma pack(push, _CRT_PACKING) @@ -166,7 +165,7 @@ def compact_property_ranges(input: list[PropertyRange]) -> list[PropertyRange]: uint16_t _Props_and_size[_NumRanges]; _NODISCARD constexpr _ValueEnum _Get_property_for_codepoint(const uint32_t _Code_point) const noexcept {{ ptrdiff_t _Upper_idx = _STD upper_bound(_Lower_bounds, _STD end(_Lower_bounds), _Code_point) - _Lower_bounds; - constexpr auto _No_value_constant = static_cast<_ValueEnum>((numeric_limits::max)()); + constexpr auto _No_value_constant = static_cast<_ValueEnum>(UINT8_MAX); if (_Upper_idx == 0) {{ return _No_value_constant; }} @@ -274,10 +273,22 @@ def read_file(filename: str) -> list[PropertyRange]: return filename, timestamp, ranges +def generate_width_estimate_intervals(filename: str, timestamp: str, width_2_ranges: list[PropertyRange]): + values = [] 
+ + for width_2_range in width_2_ranges: + values.append(width_2_range.lower) + values.append(width_2_range.upper + 1) + + return WIDTH_ESTIMATE_INTERVALS_TEMPLATE.lstrip().format( + filename=filename, timestamp=timestamp, data=",".join(['0x' + format(x, 'x') for x in values])) + + def generate_data_tables() -> str: """ Generate Unicode data for inclusion into <format> from - GraphemeBreakProperty.txt, emoji-data.txt, DerivedGeneralCategory.txt, and DerivedCoreProperties.txt + GraphemeBreakProperty.txt, emoji-data.txt, DerivedGeneralCategory.txt, DerivedCoreProperties.txt, + and EastAsianWidth.txt. GraphemeBreakProperty.txt can be found at https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt @@ -291,12 +302,16 @@ def generate_data_tables() -> str: DerivedCoreProperties.txt can be found at https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt + EastAsianWidth.txt can be found at + https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt + All files are expected to be in the same directory as this script. 
""" gbp_filename, gbp_timestamp, gbp_ranges = read_file("GraphemeBreakProperty.txt") emoji_filename, emoji_timestamp, emoji_ranges = read_file("emoji-data.txt") cat_filename, cat_timestamp, cat_ranges = read_file("DerivedGeneralCategory.txt") derived_filename, derived_timestamp, derived_ranges = read_file("DerivedCoreProperties.txt") + eaw_filename, eaw_timestamp, eaw_ranges = read_file("EastAsianWidth.txt") printable_ranges = compact_property_ranges(sorted([ PropertyRange(x.lower, x.upper, "Yes") @@ -304,6 +319,30 @@ def generate_data_tables() -> str: if x.prop not in ('Cc', 'Cf', 'Cs', 'Co', 'Cn', 'Zl', 'Zp', 'Zs') or chr(x.lower) == ' ' ], key=lambda x: x.lower)) + std_wide_ranges = [ + range(0x4DC0, 0x4DFF), + range(0x1F300, 0x1F5FF), + range(0x1F900, 0x1F9FF), + ] + + def has_width_2(prop_range): + if prop_range.prop in ("F", "W"): + return True + + for std_wide_range in std_wide_ranges: + if prop_range.lower in std_wide_range: + assert prop_range.upper <= std_wide_range.stop + + return True + else: + assert prop_range.upper not in std_wide_range + + return False + + width_2_ranges = compact_property_ranges(sorted([ + PropertyRange(x.lower, x.upper, "Yes") for x in eaw_ranges if has_width_2(x) + ], key=lambda x: x.lower)) + gpb_cpp_data = generate_cpp_data(gbp_filename, gbp_timestamp, "Grapheme_Break", gbp_ranges) emoji_cpp_data = generate_cpp_data(emoji_filename, emoji_timestamp, "Extended_Pictographic", [ x for x in emoji_ranges if x.prop == "Extended_Pictographic"]) @@ -311,8 +350,10 @@ def generate_data_tables() -> str: printable_cpp_data = generate_cpp_data(cat_filename, cat_timestamp, "_printable", printable_ranges) grapheme_extend_cpp_data = generate_cpp_data(derived_filename, derived_timestamp, "Grapheme_Extend", [ x for x in derived_ranges if x.prop == "Grapheme_Extend"]) + width_estimate_intervals = generate_width_estimate_intervals(eaw_filename, eaw_timestamp, width_2_ranges) - return "\n".join([gpb_cpp_data, emoji_cpp_data, printable_cpp_data, 
grapheme_extend_cpp_data]) + return "\n".join( + [gpb_cpp_data, emoji_cpp_data, printable_cpp_data, grapheme_extend_cpp_data, width_estimate_intervals]) if __name__ == "__main__": From 0186b4aa8b57f72319d500410ed56ca3d0d84c31 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Tue, 5 Mar 2024 16:56:40 -0800 Subject: [PATCH 2/3] Delete format_width_estimate_intervals.py. --- .../format_width_estimate_intervals.py | 201 ------------------ 1 file changed, 201 deletions(-) delete mode 100644 tools/unicode_properties_parse/format_width_estimate_intervals.py diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py deleted file mode 100644 index b0c4bc8a79..0000000000 --- a/tools/unicode_properties_parse/format_width_estimate_intervals.py +++ /dev/null @@ -1,201 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -# The following code generates data for _Width_estimate_intervals_v2 in . - -import re -from enum import Enum -from typing import TextIO -from pathlib import Path - - -# Width estimation. -class UnicodeWidth(Enum): - IS_1: int = 1 - IS_2: int = 2 - - -class UnicodeWidthTable: - # A valid Unicode code point won't exceed MAX_CODE_POINT. - MAX_CODE_POINT: int = 0x10FFFF - TABLE_SIZE: int = MAX_CODE_POINT + 1 - - def __init__(self): - self.table = [UnicodeWidth.IS_1] * (self.TABLE_SIZE) - - # "rng" denotes a right-closed range. - def fill_range(self, rng: tuple, width: int): - from_, to_ = rng - assert from_ <= to_, "invalid range" - assert to_ <= self.MAX_CODE_POINT, "invalid range" - self.table[from_ : to_ + 1] = [width] * (to_ - from_ + 1) - - def width_estimate_intervals(self): - """ - Creates a string representation of the map (in `self.table`) from - unicode code points to their width, using hexadecimal unsigned integer literals. 
- Since there are long runs of code points of one width or the other, - this representation is a list of code points where the width switches. - Additionally, the width is assumed to start at `1` from the beginning of the list. - For example, `[1, 1, 2, 2, 2, 1]` would be represented as `"0x2u, 0x5u"`. - """ - values = [] - assert self.table[0] == UnicodeWidth.IS_1 - for u in range(1, self.TABLE_SIZE): - assert ( - self.table[u] == UnicodeWidth.IS_1 or self.table[u] == UnicodeWidth.IS_2 - ) - if self.table[u] != self.table[u - 1]: - values.append(u) - - return ", ".join([f"0x{u:X}u" for u in values]) - - # Print all ranges (right-closed), where self's width is 1 and other's width is 2. - def print_ranges_1_vs_2(self, other): - def _1_vs_2(u: int): - return ( - self.table[u] == UnicodeWidth.IS_1 - and other.table[u] == UnicodeWidth.IS_2 - ) - - u = 0 - while u < self.TABLE_SIZE: - if _1_vs_2(u): - from_ = u - to_ = from_ - while to_ + 1 < self.TABLE_SIZE and _1_vs_2(to_ + 1): - to_ += 1 - if from_ == to_: - print(f"U+{from_:X}") - else: - print(f"U+{from_:X}..U+{to_:X}") - u = to_ - u += 1 - - -def get_table_cpp20() -> UnicodeWidthTable: - std_wide_ranges_cpp20 = [ - (0x1100, 0x115F), - (0x2329, 0x232A), - (0x2E80, 0x303E), - (0x3040, 0xA4CF), - (0xAC00, 0xD7A3), - (0xF900, 0xFAFF), - (0xFE10, 0xFE19), - (0xFE30, 0xFE6F), - (0xFF00, 0xFF60), - (0xFFE0, 0xFFE6), - (0x1F300, 0x1F64F), - (0x1F900, 0x1F9FF), - (0x20000, 0x2FFFD), - (0x30000, 0x3FFFD), - ] - - table = UnicodeWidthTable() - for rng in std_wide_ranges_cpp20: - table.fill_range(rng, UnicodeWidth.IS_2) - - return table - - -def read_from(source: TextIO) -> UnicodeWidthTable: - """ - Read data from "EastAsianWidth.txt". - The latest version can be found at: - https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt - The current implementation works for: - https://www.unicode.org/Public/15.1.0/ucd/EastAsianWidth.txt - To make this function work, the file should not contain a BOM. 
- """ - table = UnicodeWidthTable() - - # "The unassigned code points in the following blocks default to "W":" - default_wide_ranges = [ - (0x4E00, 0x9FFF), - (0x3400, 0x4DBF), - (0xF900, 0xFAFF), - (0x20000, 0x2FFFD), - (0x30000, 0x3FFFD), - ] - for rng in default_wide_ranges: - table.fill_range(rng, UnicodeWidth.IS_2) - - # Read explicitly assigned ranges. - # The lines that are not empty or pure comment are uniformly of the format "HEX(..HEX)? ; (A|F|H|N|Na|W) #comment". - LINE_REGEX = re.compile(r"([0-9A-Z]+)(\.\.[0-9A-Z]+)? *; *(A|F|H|N|Na|W) *#.*") - - def get_width(str: str): - if str == "F" or str == "W": - return UnicodeWidth.IS_2 - else: - assert str == "A" or str == "H" or str == "N" or str == "Na" - return UnicodeWidth.IS_1 - - for line in source: - line = line.strip() - if line and not line.startswith("#"): - match = LINE_REGEX.fullmatch(line) - assert match, line # invalid line - from_val = int(match.group(1), base=16) - width = get_width(match.group(3)) - if match.group(2): - # range (HEX..HEX) - to_val = int(match.group(2)[2:], base=16) - table.fill_range((from_val, to_val), width) - else: - # single character (HEX) - table.table[from_val] = width - - return table - - -def get_table_cpp23(source: TextIO) -> UnicodeWidthTable: - table = read_from(source) - - # Override with ranges specified by N4958 [format.string.std]/13. 
- std_wide_ranges_cpp23 = [ - (0x4DC0, 0x4DFF), - (0x1F300, 0x1F5FF), - (0x1F900, 0x1F9FF), - ] - - for rng in std_wide_ranges_cpp23: - table.fill_range(rng, UnicodeWidth.IS_2) - - return table - - -WIDTH_ESTIMATE_INTERVALS_TEMPLATE = """ -{filename} -{timestamp} -inline constexpr char32_t _Width_estimate_intervals_v2[] = {{ // -{values} }}; -""" - - -def main(): - print("Old table:") - old_table = get_table_cpp20() - print(old_table.width_estimate_intervals()) - - path = Path(__file__).absolute().with_name("EastAsianWidth.txt") - with open(path, mode="rt", encoding="utf-8") as source: - filename = source.readline().replace("#", "//").rstrip() - timestamp = source.readline().replace("#", "//").rstrip() - new_table = get_table_cpp23(source) - print("\nNew table:") - print( - WIDTH_ESTIMATE_INTERVALS_TEMPLATE.lstrip().format( - filename=filename, - timestamp=timestamp, - values=new_table.width_estimate_intervals(), - ) - ) - print("Was 1, now 2:") - old_table.print_ranges_1_vs_2(new_table) - print("\nWas 2, now 1:") - new_table.print_ranges_1_vs_2(old_table) - - -if __name__ == "__main__": - main() From d8b4686d039527fe9a9f7b493112a4eb80db7f27 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Tue, 5 Mar 2024 16:58:22 -0800 Subject: [PATCH 3/3] Cite N4971 [format.string.std]/13. 
--- .../unicode_properties_parse/grapheme_break_property_data_gen.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/unicode_properties_parse/grapheme_break_property_data_gen.py b/tools/unicode_properties_parse/grapheme_break_property_data_gen.py index a3a642f36b..0b72df7c0e 100644 --- a/tools/unicode_properties_parse/grapheme_break_property_data_gen.py +++ b/tools/unicode_properties_parse/grapheme_break_property_data_gen.py @@ -319,6 +319,7 @@ def generate_data_tables() -> str: if x.prop not in ('Cc', 'Cf', 'Cs', 'Co', 'Cn', 'Zl', 'Zp', 'Zs') or chr(x.lower) == ' ' ], key=lambda x: x.lower)) + # N4971 [format.string.std]/13 std_wide_ranges = [ range(0x4DC0, 0x4DFF), range(0x1F300, 0x1F5FF),