Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move _Width_estimate_intervals_v2 to __msvc_format_ucd_tables.hpp #4446

Merged
merged 4 commits into from
Mar 8, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions stl/inc/__msvc_format_ucd_tables.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,25 @@ inline constexpr _Unicode_property_data<_Grapheme_Extend_property_values, 363, t
0x1, 0x4, 0x1, 0x2, 0x2e, 0x17, 0x1, 0x3, 0x5, 0x8, 0x7, 0x4, 0x3, 0x37, 0x32, 0x1, 0x1, 0x5, 0xf, 0x7, 0x11,
0x7, 0x2, 0x5, 0x1, 0x7, 0x1, 0x4, 0x4, 0x7, 0x7, 0x60, 0xf0}};

// EastAsianWidth-15.0.0.txt
// Date: 2022-05-24, 17:40:20 GMT [KW, LI]
// Flat, ascending list of interval boundaries for the code points whose estimated width is 2. The generator script
// emits two values per range, `lower` followed by `upper + 1`, so each consecutive pair of elements delimits the
// half-open interval [first code point, one past the last code point).
inline constexpr char32_t _Width_estimate_intervals_v2[] = {0x1100, 0x1160, 0x231a, 0x231c, 0x2329, 0x232b, 0x23e9,
0x23ed, 0x23f0, 0x23f1, 0x23f3, 0x23f4, 0x25fd, 0x25ff, 0x2614, 0x2616, 0x2648, 0x2654, 0x267f, 0x2680, 0x2693,
0x2694, 0x26a1, 0x26a2, 0x26aa, 0x26ac, 0x26bd, 0x26bf, 0x26c4, 0x26c6, 0x26ce, 0x26cf, 0x26d4, 0x26d5, 0x26ea,
0x26eb, 0x26f2, 0x26f4, 0x26f5, 0x26f6, 0x26fa, 0x26fb, 0x26fd, 0x26fe, 0x2705, 0x2706, 0x270a, 0x270c, 0x2728,
0x2729, 0x274c, 0x274d, 0x274e, 0x274f, 0x2753, 0x2756, 0x2757, 0x2758, 0x2795, 0x2798, 0x27b0, 0x27b1, 0x27bf,
0x27c0, 0x2b1b, 0x2b1d, 0x2b50, 0x2b51, 0x2b55, 0x2b56, 0x2e80, 0x2e9a, 0x2e9b, 0x2ef4, 0x2f00, 0x2fd6, 0x2ff0,
0x2ffc, 0x3000, 0x303f, 0x3041, 0x3097, 0x3099, 0x3100, 0x3105, 0x3130, 0x3131, 0x318f, 0x3190, 0x31e4, 0x31f0,
0x321f, 0x3220, 0x3248, 0x3250, 0xa48d, 0xa490, 0xa4c7, 0xa960, 0xa97d, 0xac00, 0xd7a4, 0xf900, 0xfb00, 0xfe10,
0xfe1a, 0xfe30, 0xfe53, 0xfe54, 0xfe67, 0xfe68, 0xfe6c, 0xff01, 0xff61, 0xffe0, 0xffe7, 0x16fe0, 0x16fe5, 0x16ff0,
0x16ff2, 0x17000, 0x187f8, 0x18800, 0x18cd6, 0x18d00, 0x18d09, 0x1aff0, 0x1aff4, 0x1aff5, 0x1affc, 0x1affd, 0x1afff,
0x1b000, 0x1b123, 0x1b132, 0x1b133, 0x1b150, 0x1b153, 0x1b155, 0x1b156, 0x1b164, 0x1b168, 0x1b170, 0x1b2fc, 0x1f004,
0x1f005, 0x1f0cf, 0x1f0d0, 0x1f18e, 0x1f18f, 0x1f191, 0x1f19b, 0x1f200, 0x1f203, 0x1f210, 0x1f23c, 0x1f240, 0x1f249,
0x1f250, 0x1f252, 0x1f260, 0x1f266, 0x1f300, 0x1f650, 0x1f680, 0x1f6c6, 0x1f6cc, 0x1f6cd, 0x1f6d0, 0x1f6d3, 0x1f6d5,
0x1f6d8, 0x1f6dc, 0x1f6e0, 0x1f6eb, 0x1f6ed, 0x1f6f4, 0x1f6fd, 0x1f7e0, 0x1f7ec, 0x1f7f0, 0x1f7f1, 0x1f900, 0x1fa00,
0x1fa70, 0x1fa7d, 0x1fa80, 0x1fa89, 0x1fa90, 0x1fabe, 0x1fabf, 0x1fac6, 0x1face, 0x1fadc, 0x1fae0, 0x1fae9, 0x1faf0,
0x1faf9, 0x20000, 0x2fffe, 0x30000, 0x3fffe};

_STD_END

#pragma pop_macro("new")
Expand Down
24 changes: 0 additions & 24 deletions stl/inc/format
Original file line number Diff line number Diff line change
Expand Up @@ -1018,30 +1018,6 @@ _NODISCARD constexpr bool _Is_execution_charset_self_synchronizing() {
#endif // ^^^ EDG workaround ^^^
}

// Generated per N4950 [format.string.std]/13, by tools/unicode_properties_parse/format_width_estimate_intervals.py
// in the https://github.com/microsoft/stl repository.

// EastAsianWidth-15.0.0.txt
// Date: 2022-05-24, 17:40:20 GMT [KW, LI]
// The elements are listed in ascending order. NOTE(review): consecutive pairs look like the [first, one-past-last)
// bounds of the code point intervals that are estimated as width 2 -- confirm against the generator script.
inline constexpr char32_t _Width_estimate_intervals_v2[] = { //
0x1100u, 0x1160u, 0x231Au, 0x231Cu, 0x2329u, 0x232Bu, 0x23E9u, 0x23EDu, 0x23F0u, 0x23F1u, 0x23F3u, 0x23F4u, 0x25FDu,
0x25FFu, 0x2614u, 0x2616u, 0x2648u, 0x2654u, 0x267Fu, 0x2680u, 0x2693u, 0x2694u, 0x26A1u, 0x26A2u, 0x26AAu, 0x26ACu,
0x26BDu, 0x26BFu, 0x26C4u, 0x26C6u, 0x26CEu, 0x26CFu, 0x26D4u, 0x26D5u, 0x26EAu, 0x26EBu, 0x26F2u, 0x26F4u, 0x26F5u,
0x26F6u, 0x26FAu, 0x26FBu, 0x26FDu, 0x26FEu, 0x2705u, 0x2706u, 0x270Au, 0x270Cu, 0x2728u, 0x2729u, 0x274Cu, 0x274Du,
0x274Eu, 0x274Fu, 0x2753u, 0x2756u, 0x2757u, 0x2758u, 0x2795u, 0x2798u, 0x27B0u, 0x27B1u, 0x27BFu, 0x27C0u, 0x2B1Bu,
0x2B1Du, 0x2B50u, 0x2B51u, 0x2B55u, 0x2B56u, 0x2E80u, 0x2E9Au, 0x2E9Bu, 0x2EF4u, 0x2F00u, 0x2FD6u, 0x2FF0u, 0x2FFCu,
0x3000u, 0x303Fu, 0x3041u, 0x3097u, 0x3099u, 0x3100u, 0x3105u, 0x3130u, 0x3131u, 0x318Fu, 0x3190u, 0x31E4u, 0x31F0u,
0x321Fu, 0x3220u, 0x3248u, 0x3250u, 0xA48Du, 0xA490u, 0xA4C7u, 0xA960u, 0xA97Du, 0xAC00u, 0xD7A4u, 0xF900u, 0xFB00u,
0xFE10u, 0xFE1Au, 0xFE30u, 0xFE53u, 0xFE54u, 0xFE67u, 0xFE68u, 0xFE6Cu, 0xFF01u, 0xFF61u, 0xFFE0u, 0xFFE7u,
0x16FE0u, 0x16FE5u, 0x16FF0u, 0x16FF2u, 0x17000u, 0x187F8u, 0x18800u, 0x18CD6u, 0x18D00u, 0x18D09u, 0x1AFF0u,
0x1AFF4u, 0x1AFF5u, 0x1AFFCu, 0x1AFFDu, 0x1AFFFu, 0x1B000u, 0x1B123u, 0x1B132u, 0x1B133u, 0x1B150u, 0x1B153u,
0x1B155u, 0x1B156u, 0x1B164u, 0x1B168u, 0x1B170u, 0x1B2FCu, 0x1F004u, 0x1F005u, 0x1F0CFu, 0x1F0D0u, 0x1F18Eu,
0x1F18Fu, 0x1F191u, 0x1F19Bu, 0x1F200u, 0x1F203u, 0x1F210u, 0x1F23Cu, 0x1F240u, 0x1F249u, 0x1F250u, 0x1F252u,
0x1F260u, 0x1F266u, 0x1F300u, 0x1F650u, 0x1F680u, 0x1F6C6u, 0x1F6CCu, 0x1F6CDu, 0x1F6D0u, 0x1F6D3u, 0x1F6D5u,
0x1F6D8u, 0x1F6DCu, 0x1F6E0u, 0x1F6EBu, 0x1F6EDu, 0x1F6F4u, 0x1F6FDu, 0x1F7E0u, 0x1F7ECu, 0x1F7F0u, 0x1F7F1u,
0x1F900u, 0x1FA00u, 0x1FA70u, 0x1FA7Du, 0x1FA80u, 0x1FA89u, 0x1FA90u, 0x1FABEu, 0x1FABFu, 0x1FAC6u, 0x1FACEu,
0x1FADCu, 0x1FAE0u, 0x1FAE9u, 0x1FAF0u, 0x1FAF9u, 0x20000u, 0x2FFFEu, 0x30000u, 0x3FFFEu};

_NODISCARD constexpr int _Unicode_width_estimate(const char32_t _Ch) noexcept {
// Computes the width estimation for Unicode characters from N4950 [format.string.std]/13
// The two branches are functionally equivalent; `12` is chosen for performance here.
Expand Down
53 changes: 47 additions & 6 deletions tools/unicode_properties_parse/grapheme_break_property_data_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,10 @@ def compact_property_ranges(input: list[PropertyRange]) -> list[PropertyRange]:
}};
"""

INTERVALS_TEMPLATE = """
WIDTH_ESTIMATE_INTERVALS_TEMPLATE = """
{filename}
{timestamp}
inline constexpr char32_t _{prop_name}_ranges[{size}] = {{
inline constexpr char32_t _Width_estimate_intervals_v2[] = {{
{data}
}};
"""
Expand Down Expand Up @@ -148,7 +148,6 @@ def compact_property_ranges(input: list[PropertyRange]) -> list[PropertyRange]:
#if _STL_COMPILER_PREPROCESSOR

#include <cstdint>
#include <limits>
#include <xutility>

#pragma pack(push, _CRT_PACKING)
Expand All @@ -166,7 +165,7 @@ def compact_property_ranges(input: list[PropertyRange]) -> list[PropertyRange]:
uint16_t _Props_and_size[_NumRanges];
_NODISCARD constexpr _ValueEnum _Get_property_for_codepoint(const uint32_t _Code_point) const noexcept {{
ptrdiff_t _Upper_idx = _STD upper_bound(_Lower_bounds, _STD end(_Lower_bounds), _Code_point) - _Lower_bounds;
constexpr auto _No_value_constant = static_cast<_ValueEnum>((numeric_limits<uint8_t>::max)());
constexpr auto _No_value_constant = static_cast<_ValueEnum>(UINT8_MAX);
if (_Upper_idx == 0) {{
return _No_value_constant;
}}
Expand Down Expand Up @@ -274,10 +273,22 @@ def read_file(filename: str) -> list[PropertyRange]:
return filename, timestamp, ranges


def generate_width_estimate_intervals(filename: str, timestamp: str, width_2_ranges: list[PropertyRange]):
    """
    Render the C++ definition of the width-estimate interval table.

    Every range in width_2_ranges contributes two boundaries, in order: its
    lower bound and one past its upper bound. The boundaries are written as
    comma-separated lowercase hexadecimal literals and substituted, together
    with filename and timestamp, into WIDTH_ESTIMATE_INTERVALS_TEMPLATE.
    """
    # Flatten the ranges into [lower0, upper0 + 1, lower1, upper1 + 1, ...].
    boundaries = [
        bound
        for interval in width_2_ranges
        for bound in (interval.lower, interval.upper + 1)
    ]

    # Drop the newline that follows the template's opening triple quote.
    template = WIDTH_ESTIMATE_INTERVALS_TEMPLATE.lstrip()
    hex_literals = ['0x' + format(bound, 'x') for bound in boundaries]

    return template.format(filename=filename, timestamp=timestamp, data=",".join(hex_literals))


def generate_data_tables() -> str:
"""
Generate Unicode data for inclusion into <format> from
GraphemeBreakProperty.txt, emoji-data.txt, DerivedGeneralCategory.txt, and DerivedCoreProperties.txt
GraphemeBreakProperty.txt, emoji-data.txt, DerivedGeneralCategory.txt, DerivedCoreProperties.txt,
and EastAsianWidth.txt.

GraphemeBreakProperty.txt can be found at
https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
Expand All @@ -291,28 +302,58 @@ def generate_data_tables() -> str:
DerivedCoreProperties.txt can be found at
https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt

EastAsianWidth.txt can be found at
https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt

All files are expected to be in the same directory as this script.
"""
gbp_filename, gbp_timestamp, gbp_ranges = read_file("GraphemeBreakProperty.txt")
emoji_filename, emoji_timestamp, emoji_ranges = read_file("emoji-data.txt")
cat_filename, cat_timestamp, cat_ranges = read_file("DerivedGeneralCategory.txt")
derived_filename, derived_timestamp, derived_ranges = read_file("DerivedCoreProperties.txt")
eaw_filename, eaw_timestamp, eaw_ranges = read_file("EastAsianWidth.txt")

printable_ranges = compact_property_ranges(sorted([
PropertyRange(x.lower, x.upper, "Yes")
for x in cat_ranges
if x.prop not in ('Cc', 'Cf', 'Cs', 'Co', 'Cn', 'Zl', 'Zp', 'Zs') or chr(x.lower) == ' '
], key=lambda x: x.lower))

# Code point blocks that are treated as width 2 regardless of their
# East_Asian_Width value (the `std_` prefix presumably refers to the list in
# N4950 [format.string.std]/13). The bounds are written as inclusive code
# points; the `+ 1` converts them to Python's half-open range(), so the last
# code point of each block (U+4DFF, U+1F5FF, U+1F9FF) is part of the range.
std_wide_ranges = [
    range(0x4DC0, 0x4DFF + 1),
    range(0x1F300, 0x1F5FF + 1),
    range(0x1F900, 0x1F9FF + 1),
]

def has_width_2(prop_range):
    """
    Return True when prop_range should be estimated as two columns wide.

    A range qualifies when its East_Asian_Width property is "F" or "W", or
    when it starts inside one of std_wide_ranges. The assertions check that a
    range never straddles the boundary of a std_wide_ranges block, because a
    straddling range could not be classified as a whole.
    """
    if prop_range.prop in ("F", "W"):
        return True

    for std_wide_range in std_wide_ranges:
        if prop_range.lower in std_wide_range:
            # A range that starts inside the block must also end inside it.
            assert prop_range.upper in std_wide_range

            return True

        # A range that starts outside the block must not end inside it.
        assert prop_range.upper not in std_wide_range

    return False

width_2_ranges = compact_property_ranges(sorted([
PropertyRange(x.lower, x.upper, "Yes") for x in eaw_ranges if has_width_2(x)
], key=lambda x: x.lower))

gpb_cpp_data = generate_cpp_data(gbp_filename, gbp_timestamp, "Grapheme_Break", gbp_ranges)
emoji_cpp_data = generate_cpp_data(emoji_filename, emoji_timestamp, "Extended_Pictographic", [
x for x in emoji_ranges if x.prop == "Extended_Pictographic"])
# _printable follows a different naming scheme, to indicate that it is a fake Unicode property.
printable_cpp_data = generate_cpp_data(cat_filename, cat_timestamp, "_printable", printable_ranges)
grapheme_extend_cpp_data = generate_cpp_data(derived_filename, derived_timestamp, "Grapheme_Extend", [
x for x in derived_ranges if x.prop == "Grapheme_Extend"])
width_estimate_intervals = generate_width_estimate_intervals(eaw_filename, eaw_timestamp, width_2_ranges)

return "\n".join([gpb_cpp_data, emoji_cpp_data, printable_cpp_data, grapheme_extend_cpp_data])
return "\n".join(
[gpb_cpp_data, emoji_cpp_data, printable_cpp_data, grapheme_extend_cpp_data, width_estimate_intervals])


if __name__ == "__main__":
Expand Down