diff --git a/library/alloc/src/lib.rs b/library/alloc/src/lib.rs index 3f391fe2c1de8..55f9f255e8b4b 100644 --- a/library/alloc/src/lib.rs +++ b/library/alloc/src/lib.rs @@ -146,6 +146,7 @@ #![feature(std_internals)] #![feature(str_internals)] #![feature(temporary_niche_types)] +#![feature(titlecase)] #![feature(transmutability)] #![feature(trivial_clone)] #![feature(trusted_fused)] diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs index d1de2c5606154..660d9eac60704 100644 --- a/library/core/src/char/methods.rs +++ b/library/core/src/char/methods.rs @@ -777,8 +777,73 @@ impl char { #[inline] pub fn is_alphabetic(self) -> bool { match self { - 'a'..='z' | 'A'..='Z' => true, - c => c > '\x7f' && unicode::Alphabetic(c), + 'A'..='Z' | 'a'..='z' => true, + '\0'..='\u{A9}' => false, + _ => unicode::Alphabetic(self), + } + } + + /// Returns `true` if this `char` has the `Cased` property. + /// A character is cased if and only if it is uppercase, lowercase, or titlecase. + /// + /// `Cased` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and + /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]. 
+ /// + /// [Unicode Standard]: https://www.unicode.org/versions/latest/ + /// [ucd]: https://www.unicode.org/reports/tr44/ + /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// #![feature(titlecase)] + /// assert!('A'.is_cased()); + /// assert!('a'.is_cased()); + /// assert!(!'京'.is_cased()); + /// ``` + #[must_use] + #[unstable(feature = "titlecase", issue = "none")] + #[inline] + pub fn is_cased(self) -> bool { + match self { + 'A'..='Z' | 'a'..='z' => true, + '\0'..='\u{A9}' => false, + _ => unicode::Cased(self), + } + } + + /// Returns the case of this character: + /// [`Some(CharCase::Upper)`][`CharCase::Upper`] if [`self.is_uppercase()`][`char::is_uppercase`], + /// [`Some(CharCase::Lower)`][`CharCase::Lower`] if [`self.is_lowercase()`][`char::is_lowercase`], + /// [`Some(CharCase::Title)`][`CharCase::Title`] if [`self.is_titlecase()`][`char::is_titlecase`], and + /// `None` if [`!self.is_cased()`][`char::is_cased`]. 
+ /// + /// # Examples + /// + /// ``` + /// #![feature(titlecase)] + /// use core::char::CharCase; + /// assert_eq!('a'.case(), Some(CharCase::Lower)); + /// assert_eq!('δ'.case(), Some(CharCase::Lower)); + /// assert_eq!('A'.case(), Some(CharCase::Upper)); + /// assert_eq!('Δ'.case(), Some(CharCase::Upper)); + /// assert_eq!('Dž'.case(), Some(CharCase::Title)); + /// assert_eq!('中'.case(), None); + /// ``` + #[must_use] + #[unstable(feature = "titlecase", issue = "none")] + #[inline] + pub fn case(self) -> Option<CharCase> { + match self { + 'A'..='Z' => Some(CharCase::Upper), + 'a'..='z' => Some(CharCase::Lower), + '\0'..='\u{A9}' => None, + _ if !unicode::Cased(self) => None, + _ if unicode::Lowercase(self) => Some(CharCase::Lower), + _ if unicode::Uppercase(self) => Some(CharCase::Upper), + _ => Some(CharCase::Title), } } @@ -819,7 +884,41 @@ impl char { pub const fn is_lowercase(self) -> bool { match self { 'a'..='z' => true, - c => c > '\x7f' && unicode::Lowercase(c), + '\0'..='\u{A9}' => false, + _ => unicode::Lowercase(self), + } + } + + /// Returns `true` if this `char` has the general category for titlecase letters. + /// + /// Titlecase letters (code points with the general category of `Lt`) are described in Chapter 4 + /// (Character Properties) of the [Unicode Standard] and specified in the [Unicode Character + /// Database][ucd] [`UnicodeData.txt`]. + /// + /// [Unicode Standard]: https://www.unicode.org/versions/latest/ + /// [ucd]: https://www.unicode.org/reports/tr44/ + /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// #![feature(titlecase)] + /// assert!('Dž'.is_titlecase()); + /// assert!('ᾨ'.is_titlecase()); + /// assert!(!'D'.is_titlecase()); + /// assert!(!'z'.is_titlecase()); + /// assert!(!'中'.is_titlecase()); + /// assert!(!' 
'.is_titlecase()); + /// ``` + #[must_use] + #[unstable(feature = "titlecase", issue = "none")] + #[inline] + pub fn is_titlecase(self) -> bool { + match self { + '\0'..='\u{01C4}' => false, + _ => self.is_cased() && !self.is_lowercase() && !self.is_uppercase(), } } @@ -860,7 +959,8 @@ impl char { pub const fn is_uppercase(self) -> bool { match self { 'A'..='Z' => true, - c => c > '\x7f' && unicode::Uppercase(c), + '\0'..='\u{BF}' => false, + _ => unicode::Uppercase(self), } } @@ -893,7 +993,8 @@ impl char { pub const fn is_whitespace(self) -> bool { match self { ' ' | '\x09'..='\x0d' => true, - c => c > '\x7f' && unicode::White_Space(c), + '\0'..='\u{84}' => false, + _ => unicode::White_Space(self), } } @@ -920,10 +1021,10 @@ impl char { #[stable(feature = "rust1", since = "1.0.0")] #[inline] pub fn is_alphanumeric(self) -> bool { - if self.is_ascii() { - self.is_ascii_alphanumeric() - } else { - unicode::Alphabetic(self) || unicode::N(self) + match self { + '0'..='9' | 'A'..='Z' | 'a'..='z' => true, + '\0'..='\u{A9}' => false, + _ => unicode::Alphabetic(self) || unicode::N(self), } } @@ -969,23 +1070,7 @@ impl char { #[must_use] #[inline] pub(crate) fn is_grapheme_extended(self) -> bool { - !self.is_ascii() && unicode::Grapheme_Extend(self) - } - - /// Returns `true` if this `char` has the `Cased` property. - /// - /// `Cased` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and - /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]. 
- /// - /// [Unicode Standard]: https://www.unicode.org/versions/latest/ - /// [ucd]: https://www.unicode.org/reports/tr44/ - /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt - #[must_use] - #[inline] - #[doc(hidden)] - #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")] - pub fn is_cased(self) -> bool { - if self.is_ascii() { self.is_ascii_alphabetic() } else { unicode::Cased(self) } + self > '\u{02FF}' && unicode::Grapheme_Extend(self) } /// Returns `true` if this `char` has the `Case_Ignorable` property. @@ -1047,12 +1132,14 @@ impl char { pub fn is_numeric(self) -> bool { match self { '0'..='9' => true, - c => c > '\x7f' && unicode::N(c), + '\0'..='\u{B1}' => false, + _ => unicode::N(self), } } /// Returns an iterator that yields the lowercase mapping of this `char` as one or more - /// `char`s. + /// `char`s. The iterator also has implementations of [`Display`][core::fmt::Display] + /// and [`PartialEq`]. /// /// If this `char` does not have a lowercase mapping, the iterator yields the same `char`. /// @@ -1110,7 +1197,14 @@ impl char { /// // convert into themselves. /// assert_eq!('山'.to_lowercase().to_string(), "山"); /// ``` - #[must_use = "this returns the lowercase character as a new iterator, \ + /// + /// Check if a string is in lowercase: + /// + /// ``` + /// let s = "abcde\u{0301} 山"; + /// assert!(s.chars().all(|c| c.to_lowercase() == c)); + /// ``` + #[must_use = "this returns the lowercased character as a new iterator, \ without modifying the original"] #[stable(feature = "rust1", since = "1.0.0")] #[inline] @@ -1118,8 +1212,123 @@ impl char { ToLowercase(CaseMappingIter::new(conversions::to_lower(self))) } + /// Returns an iterator that yields the titlecase mapping of this `char` as one or more + /// `char`s. The iterator also has implementations of [`Display`][core::fmt::Display] + /// and [`PartialEq`]. 
+ /// + /// If this `char` does not have a titlecase mapping, the iterator yields the same `char`. + /// + /// If this `char` has a one-to-one titlecase mapping given by the [Unicode Character + /// Database][ucd] [`UnicodeData.txt`], the iterator yields that `char`. + /// + /// [ucd]: https://www.unicode.org/reports/tr44/ + /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt + /// + /// If this `char` requires special considerations (e.g. multiple `char`s) the iterator yields + /// the `char`(s) given by [`SpecialCasing.txt`]. + /// + /// [`SpecialCasing.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt + /// + /// This operation performs an unconditional mapping without tailoring. That is, the conversion + /// is independent of context and language. + /// + /// In the [Unicode Standard], Chapter 4 (Character Properties) discusses case mapping in + /// general and Chapter 3 (Conformance) discusses the default algorithm for case conversion. + /// + /// [Unicode Standard]: https://www.unicode.org/versions/latest/ + /// + /// # Examples + /// + /// As an iterator: + /// + /// ``` + /// #![feature(titlecase)] + /// for c in 'ß'.to_titlecase() { + /// print!("{c}"); + /// } + /// println!(); + /// ``` + /// + /// Using `println!` directly: + /// + /// ``` + /// #![feature(titlecase)] + /// println!("{}", 'ß'.to_titlecase()); + /// ``` + /// + /// Both are equivalent to: + /// + /// ``` + /// #![feature(titlecase)] + /// println!("Ss"); + /// ``` + /// + /// Using [`to_string`](../std/string/trait.ToString.html#tymethod.to_string): + /// + /// ``` + /// #![feature(titlecase)] + /// assert_eq!('c'.to_titlecase().to_string(), "C"); + /// + /// // Sometimes the result is more than one character: + /// assert_eq!('ß'.to_titlecase().to_string(), "Ss"); + /// + /// // Characters that do not have separate cased forms + /// // convert into themselves. 
+ /// assert_eq!('山'.to_titlecase().to_string(), "山"); + /// ``` + /// + /// Check if a word is in titlecase: + /// + /// ``` + /// #![feature(titlecase)] + /// let word = "Dross"; + /// let mut chars = word.chars(); + /// let first_cased_char = chars.find(|c| c.is_cased()); + /// let word_is_in_titlecase = if let Some(f) = first_cased_char { + /// f.to_titlecase() == f && chars.all(|c| c.to_lowercase() == c) + /// } else { + /// true + /// }; + /// assert!(word_is_in_titlecase); + /// ``` + /// + /// # Note on locale + /// + /// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two: + /// + /// * 'Dotless': I / ı, sometimes written ï + /// * 'Dotted': İ / i + /// + /// Note that the lowercase dotted 'i' is the same as the Latin. Therefore: + /// + /// ``` + /// #![feature(titlecase)] + /// let upper_i = 'i'.to_titlecase().to_string(); + /// ``` + /// + /// The value of `upper_i` here relies on the language of the text: if we're + /// in `en-US`, it should be `"I"`, but if we're in `tr-TR` or `az-AZ`, it should + /// be `"İ"`. `to_titlecase()` does not take this into account, and so: + /// + /// ``` + /// #![feature(titlecase)] + /// let upper_i = 'i'.to_titlecase().to_string(); + /// + /// assert_eq!(upper_i, "I"); + /// ``` + /// + /// holds across languages. + #[must_use = "this returns the titlecased character as a new iterator, \ + without modifying the original"] + #[unstable(feature = "titlecase", issue = "none")] + #[inline] + pub fn to_titlecase(self) -> ToTitlecase { + ToTitlecase(CaseMappingIter::new(conversions::to_title(self))) + } + /// Returns an iterator that yields the uppercase mapping of this `char` as one or more - /// `char`s. + /// `char`s. The iterator also has implementations of [`Display`][core::fmt::Display] + /// and [`PartialEq`]. /// /// If this `char` does not have an uppercase mapping, the iterator yields the same `char`. 
/// @@ -1178,9 +1387,16 @@ impl char { /// assert_eq!('山'.to_uppercase().to_string(), "山"); /// ``` /// + /// Check if a string is in uppercase: + /// + /// ``` + /// let s = "ABCDE\u{0301} 山"; + /// assert!(s.chars().all(|c| c.to_uppercase() == c)); + /// ``` + /// /// # Note on locale /// - /// In Turkish, the equivalent of 'i' in Latin has five forms instead of two: + /// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two: /// /// * 'Dotless': I / ı, sometimes written ï /// * 'Dotted': İ / i @@ -1192,7 +1408,7 @@ impl char { /// ``` /// /// The value of `upper_i` here relies on the language of the text: if we're - /// in `en-US`, it should be `"I"`, but if we're in `tr_TR`, it should + /// in `en-US`, it should be `"I"`, but if we're in `tr-TR` or `az-AZ`, it should /// be `"İ"`. `to_uppercase()` does not take this into account, and so: /// /// ``` @@ -1202,7 +1418,7 @@ impl char { /// ``` /// /// holds across languages. - #[must_use = "this returns the uppercase character as a new iterator, \ + #[must_use = "this returns the uppercased character as a new iterator, \ without modifying the original"] #[stable(feature = "rust1", since = "1.0.0")] #[inline] diff --git a/library/core/src/char/mod.rs b/library/core/src/char/mod.rs index 82a3f6f916be3..2deae0ea82b9f 100644 --- a/library/core/src/char/mod.rs +++ b/library/core/src/char/mod.rs @@ -363,13 +363,22 @@ impl fmt::Display for EscapeDebug { } macro_rules! 
casemappingiter_impls { - ($(#[$attr:meta])* $ITER_NAME:ident) => { + ( + #[$stab:meta] + #[$dendstab:meta] + #[$fusedstab:meta] + #[$exactstab:meta] + #[$displaystab:meta] + #[$partialstab:meta] + $(#[$attr:meta])* + $ITER_NAME:ident + ) => { $(#[$attr])* - #[stable(feature = "rust1", since = "1.0.0")] + #[$stab] #[derive(Debug, Clone)] pub struct $ITER_NAME(CaseMappingIter); - #[stable(feature = "rust1", since = "1.0.0")] + #[$stab] impl Iterator for $ITER_NAME { type Item = char; fn next(&mut self) -> Option { @@ -405,7 +414,7 @@ macro_rules! casemappingiter_impls { } } - #[stable(feature = "case_mapping_double_ended", since = "1.59.0")] + #[$dendstab] impl DoubleEndedIterator for $ITER_NAME { fn next_back(&mut self) -> Option { self.0.next_back() @@ -423,10 +432,10 @@ macro_rules! casemappingiter_impls { } } - #[stable(feature = "fused", since = "1.26.0")] + #[$fusedstab] impl FusedIterator for $ITER_NAME {} - #[stable(feature = "exact_size_case_mapping_iter", since = "1.35.0")] + #[$exactstab] impl ExactSizeIterator for $ITER_NAME { fn len(&self) -> usize { self.0.len() @@ -453,34 +462,94 @@ macro_rules! 
casemappingiter_impls { #[unstable(feature = "std_internals", issue = "none")] unsafe impl TrustedRandomAccess for $ITER_NAME {} - #[stable(feature = "char_struct_display", since = "1.16.0")] + #[$displaystab] impl fmt::Display for $ITER_NAME { #[inline] fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Display::fmt(&self.0, f) } } + + #[$partialstab] + impl PartialEq<ToUppercase> for $ITER_NAME { + #[inline] + fn eq(&self, other: &ToUppercase) -> bool { + self.0 == other.0 + } + } + + #[unstable(feature = "titlecase", issue = "none")] + impl PartialEq<ToTitlecase> for $ITER_NAME { + #[inline] + fn eq(&self, other: &ToTitlecase) -> bool { + self.0 == other.0 + } + } + + #[$partialstab] + impl PartialEq<ToLowercase> for $ITER_NAME { + #[inline] + fn eq(&self, other: &ToLowercase) -> bool { + self.0 == other.0 + } + } + + #[$partialstab] + impl PartialEq<char> for $ITER_NAME { + #[inline] + fn eq(&self, other: &char) -> bool { + self.0 == *other + } + } } } casemappingiter_impls! { - /// Returns an iterator that yields the lowercase equivalent of a `char`. + #[stable(feature = "rust1", since = "1.0.0")] + #[stable(feature = "case_mapping_double_ended", since = "1.59.0")] + #[stable(feature = "fused", since = "1.26.0")] + #[stable(feature = "exact_size_case_mapping_iter", since = "1.35.0")] + #[stable(feature = "char_struct_display", since = "1.16.0")] + #[stable(feature = "iter_partialeq", since = "CURRENT_RUSTC_VERSION")] + /// Returns an iterator that yields the uppercase equivalent of a `char`. /// - /// This `struct` is created by the [`to_lowercase`] method on [`char`]. See + /// This `struct` is created by the [`to_uppercase`] method on [`char`]. See /// its documentation for more. /// - /// [`to_lowercase`]: char::to_lowercase - ToLowercase + /// [`to_uppercase`]: char::to_uppercase + ToUppercase } casemappingiter_impls! { - /// Returns an iterator that yields the uppercase equivalent of a `char`. 
+ #[unstable(feature = "titlecase", issue = "none")] + #[unstable(feature = "titlecase", issue = "none")] + #[unstable(feature = "titlecase", issue = "none")] + #[unstable(feature = "titlecase", issue = "none")] + #[unstable(feature = "titlecase", issue = "none")] + #[unstable(feature = "titlecase", issue = "none")] + /// Returns an iterator that yields the titlecase equivalent of a `char`. /// - /// This `struct` is created by the [`to_uppercase`] method on [`char`]. See + /// This `struct` is created by the [`to_titlecase`] method on [`char`]. See /// its documentation for more. /// - /// [`to_uppercase`]: char::to_uppercase - ToUppercase + /// [`to_titlecase`]: char::to_titlecase + ToTitlecase +} + +casemappingiter_impls! { + #[stable(feature = "rust1", since = "1.0.0")] + #[stable(feature = "case_mapping_double_ended", since = "1.59.0")] + #[stable(feature = "fused", since = "1.26.0")] + #[stable(feature = "exact_size_case_mapping_iter", since = "1.35.0")] + #[stable(feature = "char_struct_display", since = "1.16.0")] + #[stable(feature = "iter_partialeq", since = "CURRENT_RUSTC_VERSION")] + /// Returns an iterator that yields the lowercase equivalent of a `char`. + /// + /// This `struct` is created by the [`to_lowercase`] method on [`char`]. See + /// its documentation for more. + /// + /// [`to_lowercase`]: char::to_lowercase + ToLowercase } #[derive(Debug, Clone)] @@ -589,6 +658,22 @@ impl fmt::Display for CaseMappingIter { } } +impl PartialEq for CaseMappingIter { + #[inline] + fn eq(&self, other: &Self) -> bool { + self.0.as_slice() == other.0.as_slice() + } +} + +impl Eq for CaseMappingIter {} + +impl PartialEq<char> for CaseMappingIter { + #[inline] + fn eq(&self, other: &char) -> bool { + self.0.as_slice() == &[*other] + } +} + /// The error type returned when a checked char conversion fails. 
#[stable(feature = "u8_from_char", since = "1.59.0")] #[derive(Debug, Copy, Clone, PartialEq, Eq)] @@ -603,3 +688,16 @@ impl fmt::Display for TryFromCharError { #[stable(feature = "u8_from_char", since = "1.59.0")] impl Error for TryFromCharError {} + +/// The case of a cased character, +/// as returned by [`char::case`]. +#[unstable(feature = "titlecase", issue = "none")] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub enum CharCase { + /// Lowercase. Corresponds to the `Lowercase` Unicode property. + Lower = 0b00, + /// Titlecase. Corresponds to the `Titlecase_Letter` Unicode general category. + Title = 0b10, + /// Uppercase. Corresponds to the `Uppercase` Unicode property. + Upper = 0b11, +} diff --git a/library/core/src/unicode/mod.rs b/library/core/src/unicode/mod.rs index c71fa754e68fb..0b6a055ba167e 100644 --- a/library/core/src/unicode/mod.rs +++ b/library/core/src/unicode/mod.rs @@ -4,12 +4,12 @@ // for use in alloc, not re-exported in std. #[rustfmt::skip] -pub use unicode_data::case_ignorable::lookup as Case_Ignorable; -pub use unicode_data::cased::lookup as Cased; pub use unicode_data::conversions; #[rustfmt::skip] pub(crate) use unicode_data::alphabetic::lookup as Alphabetic; +pub(crate) use unicode_data::case_ignorable::lookup as Case_Ignorable; +pub(crate) use unicode_data::cased::lookup as Cased; pub(crate) use unicode_data::grapheme_extend::lookup as Grapheme_Extend; pub(crate) use unicode_data::lowercase::lookup as Lowercase; pub(crate) use unicode_data::n::lookup as N; diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs index 3c38b44224f87..9006b7a1a47b8 100644 --- a/library/core/src/unicode/unicode_data.rs +++ b/library/core/src/unicode/unicode_data.rs @@ -794,6 +794,23 @@ pub mod conversions { } } + pub fn to_title(c: char) -> [char; 3] { + if c.is_ascii() { + [(c as u8).to_ascii_uppercase() as char, '\0', '\0'] + } else { + TITLECASE_TABLE + .binary_search_by(|&(key, _)| 
key.cmp(&c)) + .map(|i| { + let u = TITLECASE_TABLE[i].1; + char::from_u32(u).map(|c| [c, '\0', '\0']).unwrap_or_else(|| { + // SAFETY: Index comes from statically generated table + unsafe { *TITLECASE_TABLE_MULTI.get_unchecked((u & (INDEX_MASK - 1)) as usize) } + }) + }) + .unwrap_or_else(|_| to_upper(c)) + } + } + static LOWERCASE_TABLE: &[(char, u32); 1462] = &[ ('\u{c0}', 224), ('\u{c1}', 225), ('\u{c2}', 226), ('\u{c3}', 227), ('\u{c4}', 228), ('\u{c5}', 229), ('\u{c6}', 230), ('\u{c7}', 231), ('\u{c8}', 232), ('\u{c9}', 233), @@ -1585,4 +1602,53 @@ pub mod conversions { ['\u{544}', '\u{53b}', '\u{0}'], ['\u{54e}', '\u{546}', '\u{0}'], ['\u{544}', '\u{53d}', '\u{0}'], ]; + + static TITLECASE_TABLE: &[(char, u32); 135] = &[ + ('\u{df}', 4194304), ('\u{1c4}', 453), ('\u{1c5}', 453), ('\u{1c6}', 453), + ('\u{1c7}', 456), ('\u{1c8}', 456), ('\u{1c9}', 456), ('\u{1ca}', 459), ('\u{1cb}', 459), + ('\u{1cc}', 459), ('\u{1f1}', 498), ('\u{1f2}', 498), ('\u{1f3}', 498), + ('\u{587}', 4194305), ('\u{10d0}', 4304), ('\u{10d1}', 4305), ('\u{10d2}', 4306), + ('\u{10d3}', 4307), ('\u{10d4}', 4308), ('\u{10d5}', 4309), ('\u{10d6}', 4310), + ('\u{10d7}', 4311), ('\u{10d8}', 4312), ('\u{10d9}', 4313), ('\u{10da}', 4314), + ('\u{10db}', 4315), ('\u{10dc}', 4316), ('\u{10dd}', 4317), ('\u{10de}', 4318), + ('\u{10df}', 4319), ('\u{10e0}', 4320), ('\u{10e1}', 4321), ('\u{10e2}', 4322), + ('\u{10e3}', 4323), ('\u{10e4}', 4324), ('\u{10e5}', 4325), ('\u{10e6}', 4326), + ('\u{10e7}', 4327), ('\u{10e8}', 4328), ('\u{10e9}', 4329), ('\u{10ea}', 4330), + ('\u{10eb}', 4331), ('\u{10ec}', 4332), ('\u{10ed}', 4333), ('\u{10ee}', 4334), + ('\u{10ef}', 4335), ('\u{10f0}', 4336), ('\u{10f1}', 4337), ('\u{10f2}', 4338), + ('\u{10f3}', 4339), ('\u{10f4}', 4340), ('\u{10f5}', 4341), ('\u{10f6}', 4342), + ('\u{10f7}', 4343), ('\u{10f8}', 4344), ('\u{10f9}', 4345), ('\u{10fa}', 4346), + ('\u{10fd}', 4349), ('\u{10fe}', 4350), ('\u{10ff}', 4351), ('\u{1f80}', 8072), + ('\u{1f81}', 8073), ('\u{1f82}', 8074), 
('\u{1f83}', 8075), ('\u{1f84}', 8076), + ('\u{1f85}', 8077), ('\u{1f86}', 8078), ('\u{1f87}', 8079), ('\u{1f88}', 8072), + ('\u{1f89}', 8073), ('\u{1f8a}', 8074), ('\u{1f8b}', 8075), ('\u{1f8c}', 8076), + ('\u{1f8d}', 8077), ('\u{1f8e}', 8078), ('\u{1f8f}', 8079), ('\u{1f90}', 8088), + ('\u{1f91}', 8089), ('\u{1f92}', 8090), ('\u{1f93}', 8091), ('\u{1f94}', 8092), + ('\u{1f95}', 8093), ('\u{1f96}', 8094), ('\u{1f97}', 8095), ('\u{1f98}', 8088), + ('\u{1f99}', 8089), ('\u{1f9a}', 8090), ('\u{1f9b}', 8091), ('\u{1f9c}', 8092), + ('\u{1f9d}', 8093), ('\u{1f9e}', 8094), ('\u{1f9f}', 8095), ('\u{1fa0}', 8104), + ('\u{1fa1}', 8105), ('\u{1fa2}', 8106), ('\u{1fa3}', 8107), ('\u{1fa4}', 8108), + ('\u{1fa5}', 8109), ('\u{1fa6}', 8110), ('\u{1fa7}', 8111), ('\u{1fa8}', 8104), + ('\u{1fa9}', 8105), ('\u{1faa}', 8106), ('\u{1fab}', 8107), ('\u{1fac}', 8108), + ('\u{1fad}', 8109), ('\u{1fae}', 8110), ('\u{1faf}', 8111), ('\u{1fb2}', 4194306), + ('\u{1fb3}', 8124), ('\u{1fb4}', 4194307), ('\u{1fb7}', 4194308), ('\u{1fbc}', 8124), + ('\u{1fc2}', 4194309), ('\u{1fc3}', 8140), ('\u{1fc4}', 4194310), ('\u{1fc7}', 4194311), + ('\u{1fcc}', 8140), ('\u{1ff2}', 4194312), ('\u{1ff3}', 8188), ('\u{1ff4}', 4194313), + ('\u{1ff7}', 4194314), ('\u{1ffc}', 8188), ('\u{fb00}', 4194315), ('\u{fb01}', 4194316), + ('\u{fb02}', 4194317), ('\u{fb03}', 4194318), ('\u{fb04}', 4194319), ('\u{fb05}', 4194320), + ('\u{fb06}', 4194321), ('\u{fb13}', 4194322), ('\u{fb14}', 4194323), ('\u{fb15}', 4194324), + ('\u{fb16}', 4194325), ('\u{fb17}', 4194326), + ]; + + static TITLECASE_TABLE_MULTI: &[[char; 3]; 23] = &[ + ['S', 's', '\u{0}'], ['\u{535}', '\u{582}', '\u{0}'], ['\u{1fba}', '\u{345}', '\u{0}'], + ['\u{386}', '\u{345}', '\u{0}'], ['\u{391}', '\u{342}', '\u{345}'], + ['\u{1fca}', '\u{345}', '\u{0}'], ['\u{389}', '\u{345}', '\u{0}'], + ['\u{397}', '\u{342}', '\u{345}'], ['\u{1ffa}', '\u{345}', '\u{0}'], + ['\u{38f}', '\u{345}', '\u{0}'], ['\u{3a9}', '\u{342}', '\u{345}'], ['F', 'f', '\u{0}'], + ['F', 
'i', '\u{0}'], ['F', 'l', '\u{0}'], ['F', 'f', 'i'], ['F', 'f', 'l'], + ['S', 't', '\u{0}'], ['S', 't', '\u{0}'], ['\u{544}', '\u{576}', '\u{0}'], + ['\u{544}', '\u{565}', '\u{0}'], ['\u{544}', '\u{56b}', '\u{0}'], + ['\u{54e}', '\u{576}', '\u{0}'], ['\u{544}', '\u{56d}', '\u{0}'], + ]; } diff --git a/library/coretests/tests/char.rs b/library/coretests/tests/char.rs index f0f6a24429284..aa20585953b7c 100644 --- a/library/coretests/tests/char.rs +++ b/library/coretests/tests/char.rs @@ -1,5 +1,6 @@ +use std::char::{self, CharCase}; +use std::str; use std::str::FromStr; -use std::{char, str}; #[test] fn test_convert() { @@ -39,6 +40,29 @@ fn test_from_str() { assert!(char::from_str("abc").is_err()); } +#[test] +fn test_is_cased() { + assert!('a'.is_cased()); + assert!('ö'.is_cased()); + assert!('ß'.is_cased()); + assert!('Ü'.is_cased()); + assert!('P'.is_cased()); + assert!('ª'.is_cased()); + assert!(!'攂'.is_cased()); +} + +#[test] +fn test_char_case() { + for c in '\0'..='\u{10FFFF}' { + match c.case() { + None => assert!(!c.is_cased()), + Some(CharCase::Lower) => assert!(c.is_lowercase()), + Some(CharCase::Upper) => assert!(c.is_uppercase()), + Some(CharCase::Title) => assert!(c.is_titlecase()), + } + } +} + #[test] fn test_is_lowercase() { assert!('a'.is_lowercase()); @@ -48,6 +72,17 @@ fn test_is_lowercase() { assert!(!'P'.is_lowercase()); } +#[test] +fn test_is_titlecase() { + assert!('Dž'.is_titlecase()); + assert!('ᾨ'.is_titlecase()); + assert!(!'h'.is_titlecase()); + assert!(!'ä'.is_titlecase()); + assert!(!'ß'.is_titlecase()); + assert!(!'Ö'.is_titlecase()); + assert!(!'T'.is_titlecase()); +} + #[test] fn test_is_uppercase() { assert!(!'h'.is_uppercase()); @@ -57,6 +92,26 @@ fn test_is_uppercase() { assert!('T'.is_uppercase()); } +#[test] +fn titlecase_fast_path() { + for c in '\0'..='\u{01C4}' { + assert!(!(c.is_cased() && !c.is_lowercase() && !c.is_uppercase())) + } +} + +#[test] +fn at_most_one_case() { + for c in '\0'..='\u{10FFFF}' { + assert_eq!( + 
!c.is_cased() as u8 + + c.is_lowercase() as u8 + + c.is_uppercase() as u8 + + c.is_titlecase() as u8, + 1 + ); + } +} + #[test] fn test_is_whitespace() { assert!(' '.is_whitespace()); diff --git a/library/coretests/tests/lib.rs b/library/coretests/tests/lib.rs index e190536abcf9f..1157651da604d 100644 --- a/library/coretests/tests/lib.rs +++ b/library/coretests/tests/lib.rs @@ -109,6 +109,7 @@ #![feature(str_internals)] #![feature(strict_provenance_lints)] #![feature(test)] +#![feature(titlecase)] #![feature(trusted_len)] #![feature(trusted_random_access)] #![feature(try_blocks)] diff --git a/src/tools/unicode-table-generator/src/case_mapping.rs b/src/tools/unicode-table-generator/src/case_mapping.rs index 49aef3ec33ec7..80e7ac0051b6a 100644 --- a/src/tools/unicode-table-generator/src/case_mapping.rs +++ b/src/tools/unicode-table-generator/src/case_mapping.rs @@ -6,7 +6,7 @@ use crate::{UnicodeData, fmt_list}; const INDEX_MASK: u32 = 1 << 22; -pub(crate) fn generate_case_mapping(data: &UnicodeData) -> (String, [usize; 2]) { +pub(crate) fn generate_case_mapping(data: &UnicodeData) -> (String, [usize; 3]) { let mut file = String::new(); write!(file, "const INDEX_MASK: u32 = 0x{INDEX_MASK:x};").unwrap(); @@ -18,7 +18,10 @@ pub(crate) fn generate_case_mapping(data: &UnicodeData) -> (String, [usize; 2]) file.push_str("\n\n"); let (upper_tables, upper_size) = generate_tables("UPPER", &data.to_upper); file.push_str(&upper_tables); - (file, [lower_size, upper_size]) + file.push_str("\n\n"); + let (title_tables, title_size) = generate_tables("TITLE", &data.to_title); + file.push_str(&title_tables); + (file, [lower_size, upper_size, title_size]) } fn generate_tables(case: &str, data: &BTreeMap) -> (String, usize) { @@ -117,4 +120,21 @@ pub fn to_upper(c: char) -> [char; 3] { .unwrap_or([c, '\0', '\0']) } } + +pub fn to_title(c: char) -> [char; 3] { + if c.is_ascii() { + [(c as u8).to_ascii_uppercase() as char, '\0', '\0'] + } else { + TITLECASE_TABLE + 
.binary_search_by(|&(key, _)| key.cmp(&c)) + .map(|i| { + let u = TITLECASE_TABLE[i].1; + char::from_u32(u).map(|c| [c, '\0', '\0']).unwrap_or_else(|| { + // SAFETY: Index comes from statically generated table + unsafe { *TITLECASE_TABLE_MULTI.get_unchecked((u & (INDEX_MASK - 1)) as usize) } + }) + }) + .unwrap_or_else(|_| to_upper(c)) + } +} "; diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index ded9205ffc4b9..5c848dd1c8822 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -99,32 +99,25 @@ static PROPERTIES: &[&str] = &[ struct UnicodeData { ranges: Vec<(&'static str, Vec>)>, + /// Only stores mappings that are not to self to_upper: BTreeMap, + /// Only stores mappings that differ from `to_upper` + to_title: BTreeMap, + /// Only stores mappings that are not to self to_lower: BTreeMap, } -fn to_mapping(origin: u32, codepoints: Vec) -> Option<[u32; 3]> { - let mut a = None; - let mut b = None; - let mut c = None; - - for codepoint in codepoints { - if origin == codepoint.value() { - return None; - } - - if a.is_none() { - a = Some(codepoint.value()); - } else if b.is_none() { - b = Some(codepoint.value()); - } else if c.is_none() { - c = Some(codepoint.value()); - } else { - panic!("more than 3 mapped codepoints") - } +fn to_mapping( + if_different_from: &[ucd_parse::Codepoint], + codepoints: &[ucd_parse::Codepoint], +) -> Option<[u32; 3]> { + if codepoints == if_different_from { + return None; } - Some([a.unwrap(), b.unwrap_or(0), c.unwrap_or(0)]) + let mut ret = [ucd_parse::Codepoint::default(); 3]; + ret[0..codepoints.len()].copy_from_slice(codepoints); + Some(ret.map(ucd_parse::Codepoint::value)) } static UNICODE_DIRECTORY: &str = "unicode-downloads"; @@ -146,6 +139,7 @@ fn load_data() -> UnicodeData { let mut to_lower = BTreeMap::new(); let mut to_upper = BTreeMap::new(); + let mut to_title = BTreeMap::new(); for row in 
ucd_parse::UnicodeDataExpander::new( ucd_parse::parse::<_, ucd_parse::UnicodeData>(&UNICODE_DIRECTORY).unwrap(), ) { @@ -171,6 +165,11 @@ fn load_data() -> UnicodeData { { to_upper.insert(row.codepoint.value(), [mapped.value(), 0, 0]); } + if let Some(mapped) = row.simple_titlecase_mapping + && Some(mapped) != row.simple_uppercase_mapping + { + to_title.insert(row.codepoint.value(), [mapped.value(), 0, 0]); + } } for row in ucd_parse::parse::<_, ucd_parse::SpecialCaseMapping>(&UNICODE_DIRECTORY).unwrap() { @@ -180,12 +179,15 @@ fn load_data() -> UnicodeData { } let key = row.codepoint.value(); - if let Some(lower) = to_mapping(key, row.lowercase) { + if let Some(lower) = to_mapping(&[row.codepoint], &row.lowercase) { to_lower.insert(key, lower); } - if let Some(upper) = to_mapping(key, row.uppercase) { + if let Some(upper) = to_mapping(&[row.codepoint], &row.uppercase) { to_upper.insert(key, upper); } + if let Some(title) = to_mapping(&row.uppercase, &row.titlecase) { + to_title.insert(key, title); + } } let mut properties: Vec<(&'static str, Vec>)> = properties @@ -203,7 +205,7 @@ fn load_data() -> UnicodeData { .collect(); properties.sort_by_key(|p| p.0); - UnicodeData { ranges: properties, to_lower, to_upper } + UnicodeData { ranges: properties, to_lower, to_title, to_upper } } fn main() {