From b3ab633204757e8c670e7eb04a1755ab64e80a0c Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Mon, 13 May 2024 12:36:22 -0400 Subject: [PATCH] Support Lisu tone letters --- src/lib.rs | 57 +++++++++++++++++++++++++++++++------------------- tests/tests.rs | 24 +++++++++++++++++++++ 2 files changed, 59 insertions(+), 22 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 8acb8c7..ab8a8a1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -38,8 +38,10 @@ //! - Has the [`Emoji_Presentation`] property, and //! - Is not in the [Enclosed Ideographic Supplement] block. //! 3. The sequence `"\r\n"` has width 1. -//! 4. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2. -//! 5. The following have width 0: +//! 4. [Lisu tone letter] combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'` +//! followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1. +//! 5. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2. +//! 6. The following have width 0: //! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D) //! with the [`Default_Ignorable_Code_Point`] property. //! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D) @@ -55,11 +57,11 @@ //! - [`'\u{1B43}'` BALINESE VOWEL SIGN PEPET TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B43). //! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BHangul_Syllable_Type%3DV%7D%5Cp%7BHangul_Syllable_Type%3DT%7D) //! with a [`Hangul_Syllable_Type`] of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`). -//! 6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D) +//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D) //! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2. -//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D) +//! 8. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D) //! with an [`East_Asian_Width`] of [`Ambiguous`] have width 2 in an East Asian context, and width 1 otherwise. -//! 8. All other characters have width 1. +//! 9. All other characters have width 1. //! //! [`Default_Ignorable_Code_Point`]: https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095 //! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1 @@ -76,6 +78,8 @@ //! //! [Enclosed Ideographic Supplement]: https://unicode.org/charts/PDF/U1F200.pdf //! +//! [Lisu tone letter]: https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G42078 +//! //! ## Canonical equivalence //! //! The non-CJK width methods guarantee that canonically equivalent strings are assigned the same width. @@ -185,8 +189,14 @@ impl UnicodeWidthStr for str { enum NextCharInfo { #[default] Default, + /// `'\n'` LineFeed = 0x0A, + /// `'\u{A4FC}'..='\u{A4FD}'` + /// + TrailingLisuToneLetter, + /// `'\u{FE0E}'` Vs15 = 0x0E, + /// `'\u{FE0F}'` Vs16 = 0x0F, } @@ -204,25 +214,28 @@ fn str_width(s: &str, is_cjk: bool) -> usize { /// they're treated as single width. #[inline] fn width_in_str(c: char, is_cjk: bool, next_info: NextCharInfo) -> (usize, NextCharInfo) { - match next_info { - NextCharInfo::Vs15 if !is_cjk && cw::starts_non_ideographic_text_presentation_seq(c) => { - (1, NextCharInfo::Default) + if next_info == NextCharInfo::Vs16 && cw::starts_emoji_presentation_seq(c) { + (2, NextCharInfo::Default) + } else if c <= '\u{A0}' { + match c { + '\n' => (1, NextCharInfo::LineFeed), + '\r' if next_info == NextCharInfo::LineFeed => (0, NextCharInfo::Default), + _ => (1, NextCharInfo::Default), } - NextCharInfo::Vs16 if cw::starts_emoji_presentation_seq(c) => (2, NextCharInfo::Default), - _ => { - if c <= '\u{A0}' { - match c { - '\n' => (1, NextCharInfo::LineFeed), - '\r' if next_info == NextCharInfo::LineFeed => (0, NextCharInfo::Default), - _ => (1, NextCharInfo::Default), - } - } else { - match c { - '\u{FE0E}' => (0, NextCharInfo::Vs15), - '\u{FE0F}' => (0, NextCharInfo::Vs16), - _ => (cw::lookup_width(c, is_cjk), NextCharInfo::Default), - } + } else { + match (c, next_info) { + ('\u{A4F8}'..='\u{A4FB}', NextCharInfo::TrailingLisuToneLetter) => { + (0, NextCharInfo::Default) + } + ('\u{A4FC}'..='\u{A4FD}', _) => (1, NextCharInfo::TrailingLisuToneLetter), + ('\u{FE0E}', _) => (0, NextCharInfo::Vs15), + ('\u{FE0F}', _) => (0, NextCharInfo::Vs16), + (_, NextCharInfo::Vs15) + if !is_cjk && cw::starts_non_ideographic_text_presentation_seq(c) => + { + (1, NextCharInfo::Default) } + _ => (cw::lookup_width(c, is_cjk), NextCharInfo::Default), } } } diff --git a/tests/tests.rs b/tests/tests.rs index a6835b9..57c7aed 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -234,3 +234,27 @@ fn char_str_consistent() { assert_eq!(c.width().unwrap_or(1), s.width()) } } + +#[test] +fn test_lisu_tones() { + for c in '\u{A4F8}'..='\u{A4FD}' { + assert_eq!(c.width(), Some(1)); + assert_eq!(String::from(c).width(), 1); + } + for c1 in '\u{A4F8}'..='\u{A4FD}' { + for c2 in '\u{A4F8}'..='\u{A4FD}' { + let mut s = String::with_capacity(8); + s.push(c1); + s.push(c2); + match (c1, c2) { + ('\u{A4F8}'..='\u{A4FB}', '\u{A4FC}'..='\u{A4FD}') => assert_eq!(s.width(), 1), + _ => assert_eq!(s.width(), 2), + } + } + } + + assert_eq!("ꓪꓹ".width(), 2); + assert_eq!("ꓪꓹꓼ".width(), 2); + assert_eq!("ꓪꓹꓹ".width(), 3); + assert_eq!("ꓪꓼꓼ".width(), 3); +}