diff --git a/scripts/unicode.py b/scripts/unicode.py index 7b2601e..605edad 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -123,9 +123,9 @@ def load_east_asian_widths() -> "list[EffectiveWidth]": `Ambiguous` chracters are assigned `EffectiveWidth.AMBIGUOUS`.""" with fetch_open("EastAsianWidth.txt") as eaw: # matches a width assignment for a single codepoint, i.e. "1F336;N # ..." - single = re.compile(r"^([0-9A-F]+)\s+;\s+(\w+) +# (\w+)") + single = re.compile(r"^([0-9A-F]+)\s*;\s*(\w+) +# (\w+)") # matches a width assignment for a range of codepoints, i.e. "3001..3003;W # ..." - multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(\w+) +# (\w+)") + multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(\w+) +# (\w+)") # map between width category code and condensed width width_codes = { **{c: EffectiveWidth.NARROW for c in ["N", "Na", "H"]}, @@ -189,10 +189,10 @@ def load_zero_widths() -> "list[bool]": # canonically equivalent sequences have the same width. with fetch_open("DerivedCoreProperties.txt") as properties: single = re.compile( - r"^([0-9A-F]+)\s+;\s+(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+" + r"^([0-9A-F]+)\s*;\s*(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+" ) multiple = re.compile( - r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+" + r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+" ) for line in properties.readlines(): @@ -225,8 +225,8 @@ def load_zero_widths() -> "list[bool]": # # (See the Unicode Standard sections 3.12 and 18.6 for more on Hangul) with fetch_open("HangulSyllableType.txt") as categories: - single = re.compile(r"^([0-9A-F]+)\s+;\s+(V|T)\s+") - multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(V|T)\s+") + single = re.compile(r"^([0-9A-F]+)\s*;\s*(V|T)\s+") + multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(V|T)\s+") for line in categories.readlines(): raw_data = None # (low, high) @@ -396,14 +396,14 
@@ def make_tables( return tables -def load_variation_sequences() -> "list[int]": +def load_emoji_presentation_sequences() -> "list[int]": """Outputs a list of character ranages, corresponding to all the valid characters for starting an emoji presentation sequence.""" with fetch_open("emoji/emoji-variation-sequences.txt") as sequences: # Match all emoji presentation sequences # (one codepoint followed by U+FE0F, and labeled "emoji style") - sequence = re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s+emoji style") + sequence = re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s*emoji style") codepoints = [] for line in sequences.readlines(): if match := sequence.match(line): @@ -412,12 +412,58 @@ def load_variation_sequences() -> "list[int]": return codepoints -def make_variation_sequence_table( +def load_text_presentation_sequences() -> "list[int]": + """Outputs a list of character ranages, corresponding to all the valid characters + whose widths change with a text presentation sequence.""" + + text_presentation_seq_codepoints = set() + with fetch_open("emoji/emoji-variation-sequences.txt") as sequences: + # Match all text presentation sequences + # (one codepoint followed by U+FE0E, and labeled "text style") + sequence = re.compile(r"^([0-9A-F]+)\s+FE0E\s*;\s*text style") + for line in sequences.readlines(): + if match := sequence.match(line): + cp = int(match.group(1), 16) + text_presentation_seq_codepoints.add(cp) + + default_emoji_codepoints = set() + with fetch_open("emoji/emoji-data.txt") as emoji_data: + single = re.compile(r"^([0-9A-F]+)\s*;\s*Emoji_Presentation\s+") + multiple = re.compile( + r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Emoji_Presentation\s+" + ) + + for line in emoji_data.readlines(): + raw_data = None # (low, high) + if match := single.match(line): + raw_data = (match.group(1), match.group(1)) + elif match := multiple.match(line): + raw_data = (match.group(1), match.group(2)) + else: + continue + low = int(raw_data[0], 16) + high = int(raw_data[1], 16) + for cp in 
range(low, high + 1): + default_emoji_codepoints.add(cp) + + codepoints = [] + for cp in text_presentation_seq_codepoints.intersection(default_emoji_codepoints): + # "Enclosed Ideographic Supplement" block; + # wide even in text presentation + if not cp in range(0x1F200, 0x1F300): + codepoints.append(cp) + + codepoints.sort() + return codepoints + + +def make_presentation_sequence_table( seqs: "list[int]", width_map: "list[EffectiveWidth]", -) -> "tuple[list[int], list[list[int]]]": - """Generates 2-level lookup table for whether a codepoint might start an emoji presentation sequence. - (Characters that are always wide may be excluded.) + spurious_false: "set[EffectiveWidth]", + spurious_true: "set[EffectiveWidth]", +) -> "tuple[list[tuple[int, int]], list[list[int]]]": + """Generates 2-level lookup table for whether a codepoint might start an emoji variation sequence. The first level is a match on all but the 10 LSB, the second level is a 1024-bit bitmap for those 10 LSB. """ @@ -425,34 +471,46 @@ def make_variation_sequence_table( for cp in seqs: prefixes_dict[cp >> 10].add(cp & 0x3FF) - # We don't strictly need to keep track of characters that are always wide, - # because being in an emoji variation seq won't affect their width. - # So store their info only when it wouldn't inflate the size of the tables. for k in list(prefixes_dict.keys()): if all( map( - lambda cp: width_map[(k << 10) | cp] == EffectiveWidth.WIDE, + lambda cp: width_map[(k << 10) | cp] in spurious_false, prefixes_dict[k], ) ): del prefixes_dict[k] - indexes = list(prefixes_dict.keys()) + msbs: "list[int]" = list(prefixes_dict.keys()) - # Similarly, we can spuriously return `true` for always-wide characters - # even if not part of a presentation seq; this saves an additional lookup, - # so we should do it where there is no size cost. 
for cp, width in enumerate(width_map): - if width == EffectiveWidth.WIDE and (cp >> 10) in indexes: + if width in spurious_true and (cp >> 10) in msbs: prefixes_dict[cp >> 10].add(cp & 0x3FF) - leaves = [] + leaves: "list[list[int]]" = [] for cps in prefixes_dict.values(): leaf = [0] * 128 for cp in cps: idx_in_leaf, bit_shift = divmod(cp, 8) leaf[idx_in_leaf] |= 1 << bit_shift leaves.append(leaf) + + indexes = [(msb, index) for (index, msb) in enumerate(msbs)] + + # Cull duplicate leaves + i = 0 + while i < len(leaves): + first_idx = leaves.index(leaves[i]) + if first_idx == i: + i += 1 + else: + for j in range(0, len(indexes)): + if indexes[j][1] == i: + indexes[j] = (indexes[j][0], first_idx) + elif indexes[j][1] > i: + indexes[j] = (indexes[j][0], indexes[j][1] - 1) + + leaves.pop(i) + return (indexes, leaves) @@ -460,7 +518,8 @@ def emit_module( out_name: str, unicode_version: "tuple[int, int, int]", tables: "list[Table]", - variation_table: "tuple[list[int], list[list[int]]]", + emoji_presentation_table: "tuple[list[tuple[int, int]], list[list[int]]]", + text_presentation_table: "tuple[list[tuple[int, int]], list[list[int]]]", ): """Outputs a Rust module to `out_name` using table data from `tables`. If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`. 
@@ -537,7 +596,8 @@ def emit_module( """ ) - variation_idx, variation_leaves = variation_table + emoji_presentation_idx, emoji_presentation_leaves = emoji_presentation_table + text_presentation_idx, text_presentation_leaves = text_presentation_table module.write( """ @@ -555,7 +615,7 @@ def emit_module( """ ) - for i, msbs in enumerate(variation_idx): + for msbs, i in emoji_presentation_idx: module.write(f" {msbs} => {i},\n") module.write( @@ -571,6 +631,39 @@ def emit_module( """ ) + module.write( + """ + /// Returns `true` iff `c` has default emoji presentation, but forms a + /// [text presentation sequence](https://www.unicode.org/reports/tr51/#def_text_presentation_sequence) + /// when followed by `'\\u{FE0E}'`, and is not ideographic. + /// Such sequences are considered to have width 1. + /// + /// This may spuriously return `true` for characters of narrow or ambiguous width. + #[inline] + pub fn starts_non_ideographic_text_presentation_seq(c: char) -> bool { + let cp: u32 = c.into(); + // First level of lookup uses all but 10 LSB + let top_bits = cp >> 10; + let idx_of_leaf: usize = match top_bits { +""" + ) + + for msbs, i in text_presentation_idx: + module.write(f" {msbs} => {i},\n") + + module.write( + """ _ => return false, + }; + // Extract the 3-9th (0-indexed) least significant bits of `cp`, + // and use them to index into `leaf_row`. + let idx_within_leaf = usize::try_from((cp >> 3) & 0x7F).unwrap(); + let leaf_byte = TEXT_PRESENTATION_LEAVES.0[idx_of_leaf][idx_within_leaf]; + // Use the 3 LSB of `cp` to index into `leaf_byte`. + ((leaf_byte >> (cp & 7)) & 1) == 1 + } +""" + ) + module.write( """ /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or @@ -626,12 +719,32 @@ def emit_module( f""" #[repr(align(128))] struct Align128(T); - /// Array of 1024-bit bitmaps.
Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`) - /// bitmap with the 10 LSB of your codepoint to get whether it can start an emoji presentation seq. - static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128]; {len(variation_leaves)}]> = Align128([ + /// Array of 1024-bit bitmaps. Index into the correct bitmap with the 10 LSB of your codepoint + /// to get whether it can start an emoji presentation sequence. + static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128]; {len(emoji_presentation_leaves)}]> = Align128([ +""" + ) + for leaf in emoji_presentation_leaves: + module.write(" [\n") + for row in batched(leaf, 14): + module.write(" ") + for entry in row: + module.write(f" 0x{entry:02X},") + module.write("\n") + module.write(" ],\n") + + module.write(" ]);\n") + + # text table + + module.write( + f""" + /// Array of 1024-bit bitmaps. Index into the correct bitmap with the 10 LSB of your codepoint + /// to get whether it can start a text presentation sequence. + static TEXT_PRESENTATION_LEAVES: Align128<[[u8; 128]; {len(text_presentation_leaves)}]> = Align128([ """ ) - for leaf in variation_leaves: + for leaf in text_presentation_leaves: module.write(" [\n") for row in batched(leaf, 14): module.write(" ") @@ -650,21 +763,7 @@ def main(module_path: str): lookup table for character width, and write a Rust module utilizing that table to `module_filename`. - We obey the following rules, in decreasing order of importance: - - - Emoji presentation sequences are double-width. - - The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c) - - Hangul jamo medial vowels & final consonants are zero-width. - - `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER. - - Control characters are zero-width. - - `Grapheme_Extend` chracters, as well as eight characters that NFD decompose to `Grapheme_Extend` chracters, - are zero-width. - - Codepoints with an East Asian Width of `Ambigous` are ambiguous-width. 
- - Codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width. - - All other codepoints (including unassigned codepoints and codepoints with an East Asian Width - of `Neutral`, `Narrow`, or `Halfwidth`) are single-width. - - These rules are based off of UAX11, other Unicode standards, and various `wcwidth()` implementations. + See `lib.rs` for documentation of the exact width rules. """ version = load_unicode_version() print(f"Generating module for Unicode {version[0]}.{version[1]}.{version[2]}") @@ -682,8 +781,18 @@ def main(module_path: str): tables = make_tables(TABLE_CFGS, enumerate(width_map)) - emoji_variations = load_variation_sequences() - variation_table = make_variation_sequence_table(emoji_variations, width_map) + emoji_presentations = load_emoji_presentation_sequences() + emoji_presentation_table = make_presentation_sequence_table( + emoji_presentations, width_map, {EffectiveWidth.WIDE}, {EffectiveWidth.WIDE} + ) + + text_presentations = load_text_presentation_sequences() + text_presentation_table = make_presentation_sequence_table( + text_presentations, + width_map, + set(), + {EffectiveWidth.NARROW, EffectiveWidth.AMBIGUOUS}, + ) # Download normalization test file for use by tests fetch_open("NormalizationTest.txt", "../tests/") @@ -694,16 +803,23 @@ def main(module_path: str): size_bytes = len(table.to_bytes()) print(f"Table {i} size: {size_bytes} bytes") total_size += size_bytes - emoji_index_size = len(variation_table[0]) * 4 - print(f"Emoji presentation index size: {emoji_index_size} bytes") - total_size += emoji_index_size - emoji_leaves_size = len(variation_table[1]) * len(variation_table[1][0]) - print(f"Emoji presentation leaves size: {emoji_leaves_size} bytes") - total_size += emoji_leaves_size + + for s, table in [ + ("Emoji", emoji_presentation_table), + ("Text", text_presentation_table), + ]: + index_size = len(table[0]) * 4 + print(f"{s} presentation index size: {index_size} bytes") + total_size += index_size + 
leaves_size = len(table[1]) * len(table[1][0]) + print(f"{s} presentation leaves size: {leaves_size} bytes") + total_size += leaves_size print("------------------------") print(f" Total size: {total_size} bytes") - emit_module(module_path, version, tables, variation_table) + emit_module( + module_path, version, tables, emoji_presentation_table, text_presentation_table + ) print(f'Wrote to "{module_path}"') diff --git a/src/lib.rs b/src/lib.rs index 339d795..a31c7ca 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -33,16 +33,20 @@ //! This crate currently uses the following rules to determine the width of a //! character or string, in order of decreasing precedence. These may be tweaked in the future. //! -//! 1. [Emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence) -//! have width 2. (The width of a string may therefore differ from the sum of the widths of its characters.) -//! 2. [`'\u{00AD}'` SOFT HYPHEN](https://util.unicode.org/UnicodeJsps/character.jsp?a=00AD) has width 1. -//! 3. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2. -//! 4. The following have width 0: +//! 1. [Emoji presentation sequences] have width 2. +//! (The width of a string may therefore differ from the sum of the widths of its characters.) +//! 2. Outside of an East Asian context, [text presentation sequences] have width 1 +//! iff their base character fulfills all the following requirements: +//! - Has the [`Emoji_Presentation`] property, and +//! - Not in the [Enclosed Ideographic Supplement] block. +//! 3. [`'\u{00AD}'` SOFT HYPHEN](https://util.unicode.org/UnicodeJsps/character.jsp?a=00AD) has width 1. +//! 4. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2. +//! 5. The following have width 0: //! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D) //! 
with the [`Default_Ignorable_Code_Point`](https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095) property. //! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D) //! with the [`Grapheme_Extend`] property. -//! - The following 8 characters, all of which have NFD decompositions consisting of two [`Grapheme_Extend`] chracters: +//! - The following 8 characters, all of which have NFD decompositions consisting of two [`Grapheme_Extend`] characters: //! - [`'\u{0CC0}'` KANNADA VOWEL SIGN II](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CC0), //! - [`'\u{0CC7}'` KANNADA VOWEL SIGN EE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CC7), //! - [`'\u{0CC8}'` KANNADA VOWEL SIGN AI](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CC8), @@ -52,22 +56,29 @@ //! - [`'\u{1B3D}'` BALINESE VOWEL SIGN LA LENGA TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B3D), and //! - [`'\u{1B43}'` BALINESE VOWEL SIGN PEPET TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B43). //! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BHangul_Syllable_Type%3DV%7D%5Cp%7BHangul_Syllable_Type%3DT%7D) -//! with a [`Hangul_Syllable_Type`](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593) -//! of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`). +//! with a [`Hangul_Syllable_Type`] of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`). //! - [`'\0'` NUL](https://util.unicode.org/UnicodeJsps/character.jsp?a=0000). -//! 5. The [control characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BCc%7D) +//! 6. The [control characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BCc%7D) //! have no defined width, and are ignored when determining the width of a string. -//! 6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D) -//! 
with an [`East_Asian_Width`] of [`Fullwidth` (`F`)](https://www.unicode.org/reports/tr11/#ED2) -//! or [`Wide` (`W`)](https://www.unicode.org/reports/tr11/#ED4) have width 2. -//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D) -//! with an [`East_Asian_Width`] of [`Ambiguous` (`A`)](https://www.unicode.org/reports/tr11/#ED6) -//! have width 2 in an East Asian context, and width 1 otherwise. -//! 8. All other characters have width 1. +//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D) +//! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2. +//! 8. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D) +//! with an [`East_Asian_Width`] of [`Ambiguous`] have width 2 in an East Asian context, and width 1 otherwise. +//! 9. All other characters have width 1. //! //! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1 +//! [`Emoji_Presentation`]: https://unicode.org/reports/tr51/#def_emoji_presentation //! [`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G52443 - +//! [`Hangul_Syllable_Type`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593 +//! +//! [`Fullwidth`]: https://www.unicode.org/reports/tr11/#ED2 +//! [`Wide`]: https://www.unicode.org/reports/tr11/#ED4 +//! [`Ambiguous`]: https://www.unicode.org/reports/tr11/#ED6 +//! +//! [Emoji presentation sequences]: https://unicode.org/reports/tr51/#def_emoji_presentation_sequence +//! [text presentation sequences]: https://unicode.org/reports/tr51/#def_text_presentation_sequence +//! +//! [Enclosed Ideographic Supplement]: https://unicode.org/charts/PDF/U1F200.pdf //! //! ## Canonical equivalence //!
@@ -159,18 +170,29 @@ impl UnicodeWidthStr for str { } } +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum VariationSelector { + Vs15 = 0x0E, + Vs16 = 0x0F, +} + fn str_width(s: &str, is_cjk: bool) -> usize { s.chars() - .rfold((0, false), |(sum, was_fe0f), c| { - if c == '\u{FE0F}' { - (sum, true) - } else { - let add = if was_fe0f && cw::starts_emoji_presentation_seq(c) { - 2 - } else { - cw::width(c, is_cjk).unwrap_or(0) + .rfold((0, None), |(sum, vsel), c| match c { + '\u{FE0E}' => (sum, Some(VariationSelector::Vs15)), + '\u{FE0F}' => (sum, Some(VariationSelector::Vs16)), + _ => { + let add = match vsel { + Some(VariationSelector::Vs15) + if !is_cjk && cw::starts_non_ideographic_text_presentation_seq(c) => + { + 1 + } + + Some(VariationSelector::Vs16) if cw::starts_emoji_presentation_seq(c) => 2, + _ => cw::width(c, is_cjk).unwrap_or(0), }; - (sum + add, false) + (sum + add, None) } }) .0 diff --git a/src/tables.rs b/src/tables.rs index 2bdc7b3..f2aff59 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -84,6 +84,33 @@ pub mod charwidth { ((leaf_byte >> (cp & 7)) & 1) == 1 } + /// Returns `true` iff `c` has default emoji presentation, but forms a + /// [text presentation sequence](https://www.unicode.org/reports/tr51/#def_text_presentation_sequence) + /// when followed by `'\u{FE0E}'`, and is not ideographic. + /// Such sequences are considered to have width 1. + /// + /// This may spuriously return `true` for characters of narrow or ambiguous width. + #[inline] + pub fn starts_non_ideographic_text_presentation_seq(c: char) -> bool { + let cp: u32 = c.into(); + // First level of lookup uses all but 10 LSB + let top_bits = cp >> 10; + let idx_of_leaf: usize = match top_bits { + 8 => 0, + 9 => 1, + 10 => 1, + 124 => 2, + 125 => 3, + _ => return false, + }; + // Extract the 3-9th (0-indexed) least significant bits of `cp`, + // and use them to index into `leaf_row`.
+ let idx_within_leaf = usize::try_from((cp >> 3) & 0x7F).unwrap(); + let leaf_byte = TEXT_PRESENTATION_LEAVES.0[idx_of_leaf][idx_within_leaf]; + // Use the 3 LSB of `cp` to index into `leaf_byte`. + ((leaf_byte >> (cp & 7)) & 1) == 1 + } + /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or /// `None` if `c` is a control character other than `'\x00'`. /// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise, @@ -568,8 +595,8 @@ pub mod charwidth { #[repr(align(128))] struct Align128(T); - /// Array of 1024-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`) - /// bitmap with the 10 LSB of your codepoint to get whether it can start an emoji presentation seq. + /// Array of 1024-bit bitmaps. Index into the correct bitmap with the 10 LSB of your codepoint + /// to get whether it can start an emoji presentation sequence. static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128]; 6]> = Align128([ [ 0x00, 0x00, 0x00, 0x00, 0x08, 0x04, 0xFF, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -644,4 +671,57 @@ pub mod charwidth { 0x01, 0x00, ], ]); + + /// Array of 1024-bit bitmaps. Index into the correct bitmap with the 10 LSB of your codepoint + /// to get whether it can start a text presentation sequence. 
+ static TEXT_PRESENTATION_LEAVES: Align128<[[u8; 128]; 4]> = Align128([ + [ + 0xFF, 0x07, 0xFF, 0xFF, 0xFF, 0x83, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, + 0x00, 0x00, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xF9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, + ], + [ + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, + ], + [ + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x7F, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBF, 0x01, 0xF8, 0xFF, 
0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xF8, 0xFF, 0x00, 0x00, 0x00, 0x00, + 0x00, 0xF0, 0x00, 0xFE, 0xFC, 0xFF, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0xE0, + 0x20, 0x10, 0xFE, 0x1F, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, + 0x00, 0x00, 0xF8, 0xFF, 0x80, 0x70, 0x00, 0x00, 0x54, 0x7C, 0xF0, 0xFF, 0x01, 0x20, + 0xEE, 0x00, + ], + [ + 0x00, 0x01, 0x20, 0x80, 0x40, 0x00, 0x00, 0x80, 0xC6, 0x63, 0x08, 0x00, 0x00, 0x04, + 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x09, 0x88, 0x00, 0x08, 0x00, 0x84, + 0x70, 0x3C, 0x80, 0x6E, 0x00, 0x21, 0x0C, 0x00, 0x00, 0x00, 0x00, 0xC0, 0xFF, 0x87, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFB, 0xFF, 0xFF, 0x9F, 0xFF, 0xEF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x07, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x20, 0x12, 0x01, + 0x00, 0x20, 0x04, 0x16, 0xC0, 0xEF, 0x18, 0x0F, 0xFF, 0xE7, 0x0F, 0xE0, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0xF0, + 0xFE, 0xFF, + ], + ]); } diff --git a/tests/tests.rs b/tests/tests.rs index 5c70c6b..a8a5922 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -198,3 +198,24 @@ fn test_emoji_presentation() { assert_eq!(UnicodeWidthStr::width("\u{1F6F3}\u{FE0F}"), 2); assert_eq!(UnicodeWidthStr::width("\u{1F700}\u{FE0F}"), 1); } + +#[test] +fn test_text_presentation() { + assert_eq!('\u{FE0E}'.width(), Some(0)); + + assert_eq!('\u{2648}'.width(), Some(2)); + assert_eq!("\u{2648}\u{FE0E}".width(), 1); + assert_eq!("\u{2648}\u{FE0E}".width_cjk(), 2); + + assert_eq!("\u{1F21A}\u{FE0E}".width(), 2); + assert_eq!("\u{1F21A}\u{FE0E}".width_cjk(), 2); + + assert_eq!("\u{0301}\u{FE0E}".width(), 0); + assert_eq!("\u{0301}\u{FE0E}".width_cjk(), 0); + + 
assert_eq!("a\u{FE0E}".width(), 1); + assert_eq!("a\u{FE0E}".width_cjk(), 1); + + assert_eq!("𘀀\u{FE0E}".width(), 2); + assert_eq!("𘀀\u{FE0E}".width_cjk(), 2); +}