diff --git a/library/alloc/src/lib.rs b/library/alloc/src/lib.rs index 26d238154a3ba..af141aeb83019 100644 --- a/library/alloc/src/lib.rs +++ b/library/alloc/src/lib.rs @@ -160,6 +160,7 @@ #![feature(std_internals)] #![feature(str_internals)] #![feature(strict_provenance)] +#![feature(titlecase)] #![feature(trusted_fused)] #![feature(trusted_len)] #![feature(trusted_random_access)] diff --git a/library/alloc/src/str.rs b/library/alloc/src/str.rs index ade114678b7f9..9421b62975690 100644 --- a/library/alloc/src/str.rs +++ b/library/alloc/src/str.rs @@ -410,9 +410,9 @@ impl str { } fn case_ignorable_then_cased>(iter: I) -> bool { - use core::unicode::{Case_Ignorable, Cased}; + use core::unicode::Case_Ignorable; match iter.skip_while(|&c| Case_Ignorable(c)).next() { - Some(c) => Cased(c), + Some(c) => c.is_cased(), None => false, } } diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs index 65ae483183901..cf5bd699b763c 100644 --- a/library/core/src/char/methods.rs +++ b/library/core/src/char/methods.rs @@ -742,8 +742,73 @@ impl char { #[inline] pub fn is_alphabetic(self) -> bool { match self { - 'a'..='z' | 'A'..='Z' => true, - c => c > '\x7f' && unicode::Alphabetic(c), + 'A'..='Z' | 'a'..='z' => true, + '\0'..='\u{A9}' => false, + _ => unicode::Alphabetic(self), + } + } + + /// Returns `true` if this `char` has the `Cased` property. + /// A character is cased if and only if it is uppercase, lowercase, or titlecase. + /// + /// `Cased` is described in Chapter 3 (Conformance) of the [Unicode Standard] and + /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]. 
+ /// + /// [Unicode Standard]: https://www.unicode.org/versions/latest/ + /// [ucd]: https://www.unicode.org/reports/tr44/ + /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// #![feature(titlecase)] + /// assert!('A'.is_cased()); + /// assert!('a'.is_cased()); + /// assert!(!'京'.is_cased()); + /// ``` + #[must_use] + #[unstable(feature = "titlecase", issue = "none")] + #[inline] + pub fn is_cased(self) -> bool { + match self { + 'A'..='Z' | 'a'..='z' => true, + '\0'..='\u{A9}' => false, + _ => unicode::Cased(self), + } + } + + /// Returns the case of this character: + /// [`Some(CharCase::Upper)`][`CharCase::Upper`] if [`self.is_uppercase()`][`char::is_uppercase`], + /// [`Some(CharCase::Lower)`][`CharCase::Lower`] if [`self.is_lowercase()`][`char::is_lowercase`], + /// [`Some(CharCase::Title)`][`CharCase::Title`] if [`self.is_titlecase()`][`char::is_titlecase`], and + /// `None` if [`!self.is_cased()`][`char::is_cased`]. 
+ /// + /// # Examples + /// + /// ``` + /// #![feature(titlecase)] + /// use core::char::CharCase; + /// assert_eq!('a'.case(), Some(CharCase::Lower)); + /// assert_eq!('δ'.case(), Some(CharCase::Lower)); + /// assert_eq!('A'.case(), Some(CharCase::Upper)); + /// assert_eq!('Δ'.case(), Some(CharCase::Upper)); + /// assert_eq!('Dž'.case(), Some(CharCase::Title)); + /// assert_eq!('中'.case(), None); + /// ``` + #[must_use] + #[unstable(feature = "titlecase", issue = "none")] + #[inline] + pub fn case(self) -> Option { + match self { + 'A'..='Z' => Some(CharCase::Upper), + 'a'..='z' => Some(CharCase::Lower), + '\0'..='\u{A9}' => None, + _ if !self.is_cased() => None, + _ if self.is_lowercase() => Some(CharCase::Lower), + _ if self.is_uppercase() => Some(CharCase::Upper), + _ => Some(CharCase::Title), } } @@ -785,7 +850,41 @@ impl char { pub const fn is_lowercase(self) -> bool { match self { 'a'..='z' => true, - c => c > '\x7f' && unicode::Lowercase(c), + '\0'..='\u{A9}' => false, + _ => unicode::Lowercase(self), + } + } + + /// Returns `true` if this `char` has the general category for titlecase letters. + /// + /// Titlecase letters (code points with the general category of `Lt`) are described in Chapter 4 + /// (Character Properties) of the [Unicode Standard] and specified in the [Unicode Character + /// Database][ucd] [`UnicodeData.txt`]. + /// + /// [Unicode Standard]: https://www.unicode.org/versions/latest/ + /// [ucd]: https://www.unicode.org/reports/tr44/ + /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// #![feature(titlecase)] + /// assert!('Dž'.is_titlecase()); + /// assert!('ᾨ'.is_titlecase()); + /// assert!(!'D'.is_titlecase()); + /// assert!(!'z'.is_titlecase()); + /// assert!(!'中'.is_titlecase()); + /// assert!(!' 
'.is_titlecase()); + /// ``` + #[must_use] + #[unstable(feature = "titlecase", issue = "none")] + #[inline] + pub fn is_titlecase(self) -> bool { + match self { + '\0'..='\u{01C4}' => false, + _ => self.is_cased() && !self.is_lowercase() && !self.is_uppercase(), } } @@ -827,7 +926,8 @@ impl char { pub const fn is_uppercase(self) -> bool { match self { 'A'..='Z' => true, - c => c > '\x7f' && unicode::Uppercase(c), + '\0'..='\u{BF}' => false, + _ => unicode::Uppercase(self), } } @@ -859,7 +959,8 @@ impl char { pub fn is_whitespace(self) -> bool { match self { ' ' | '\x09'..='\x0d' => true, - c => c > '\x7f' && unicode::White_Space(c), + '\0'..='\u{84}' => false, + _ => unicode::White_Space(self), } } @@ -927,7 +1028,7 @@ impl char { #[must_use] #[inline] pub(crate) fn is_grapheme_extended(self) -> bool { - self > '\x7f' && unicode::Grapheme_Extend(self) + self > '\u{02FF}' && unicode::Grapheme_Extend(self) } /// Returns `true` if this `char` has one of the general categories for numbers. @@ -969,12 +1070,14 @@ impl char { pub fn is_numeric(self) -> bool { match self { '0'..='9' => true, - c => c > '\x7f' && unicode::N(c), + '\0'..='\u{B1}' => false, + _ => unicode::N(self), } } /// Returns an iterator that yields the lowercase mapping of this `char` as one or more - /// `char`s. + /// `char`s. The iterator also has implementations of [`Display`][core::fmt::Display] + /// and [`PartialEq`]. /// /// If this `char` does not have a lowercase mapping, the iterator yields the same `char`. /// @@ -1032,7 +1135,14 @@ impl char { /// // convert into themselves. 
/// assert_eq!('山'.to_lowercase().to_string(), "山"); /// ``` - #[must_use = "this returns the lowercase character as a new iterator, \ + /// + /// Check if a string is in lowercase: + /// + /// ``` + /// let s = "abcde\u{0301} 山"; + /// assert!(s.chars().all(|c| c.to_lowercase() == c)); + /// ``` + #[must_use = "this returns the lowercased character as a new iterator, \ without modifying the original"] #[stable(feature = "rust1", since = "1.0.0")] #[inline] @@ -1040,8 +1150,123 @@ impl char { ToLowercase(CaseMappingIter::new(conversions::to_lower(self))) } + /// Returns an iterator that yields the titlecase mapping of this `char` as one or more + /// `char`s. The iterator also has implementations of [`Display`][core::fmt::Display] + /// and [`PartialEq`]. + /// + /// If this `char` does not have a titlecase mapping, the iterator yields the same `char`. + /// + /// If this `char` has a one-to-one titlecase mapping given by the [Unicode Character + /// Database][ucd] [`UnicodeData.txt`], the iterator yields that `char`. + /// + /// [ucd]: https://www.unicode.org/reports/tr44/ + /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt + /// + /// If this `char` requires special considerations (e.g. multiple `char`s) the iterator yields + /// the `char`(s) given by [`SpecialCasing.txt`]. + /// + /// [`SpecialCasing.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt + /// + /// This operation performs an unconditional mapping without tailoring. That is, the conversion + /// is independent of context and language. + /// + /// In the [Unicode Standard], Chapter 4 (Character Properties) discusses case mapping in + /// general and Chapter 3 (Conformance) discusses the default algorithm for case conversion. 
+ /// + /// [Unicode Standard]: https://www.unicode.org/versions/latest/ + /// + /// # Examples + /// + /// As an iterator: + /// + /// ``` + /// #![feature(titlecase)] + /// for c in 'ß'.to_titlecase() { + /// print!("{c}"); + /// } + /// println!(); + /// ``` + /// + /// Using `println!` directly: + /// + /// ``` + /// #![feature(titlecase)] + /// println!("{}", 'ß'.to_titlecase()); + /// ``` + /// + /// Both are equivalent to: + /// + /// ``` + /// #![feature(titlecase)] + /// println!("Ss"); + /// ``` + /// + /// Using [`to_string`](../std/string/trait.ToString.html#tymethod.to_string): + /// + /// ``` + /// #![feature(titlecase)] + /// assert_eq!('c'.to_titlecase().to_string(), "C"); + /// + /// // Sometimes the result is more than one character: + /// assert_eq!('ß'.to_titlecase().to_string(), "Ss"); + /// + /// // Characters that do not have separate cased forms + /// // convert into themselves. + /// assert_eq!('山'.to_titlecase().to_string(), "山"); + /// ``` + /// + /// Check if a word is in titlecase: + /// + /// ``` + /// #![feature(titlecase)] + /// let word = "Dross"; + /// let mut chars = word.chars(); + /// let first_cased_char = chars.find(|c| c.is_cased()); + /// let word_is_in_titlecase = if let Some(f) = first_cased_char { + /// f.to_titlecase() == f && chars.all(|c| c.to_lowercase() == c) + /// } else { + /// true + /// }; + /// assert!(word_is_in_titlecase); + /// ``` + /// + /// # Note on locale + /// + /// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two: + /// + /// * 'Dotless': I / ı, sometimes written ï + /// * 'Dotted': İ / i + /// + /// Note that the lowercase dotted 'i' is the same as the Latin. Therefore: + /// + /// ``` + /// #![feature(titlecase)] + /// let upper_i = 'i'.to_titlecase().to_string(); + /// ``` + /// + /// The value of `upper_i` here relies on the language of the text: if we're + /// in `en-US`, it should be `"I"`, but if we're in `tr-TR` or `az-AZ`, it should + /// be `"İ"`. 
`to_titlecase()` does not take this into account, and so: + /// + /// ``` + /// #![feature(titlecase)] + /// let upper_i = 'i'.to_titlecase().to_string(); + /// + /// assert_eq!(upper_i, "I"); + /// ``` + /// + /// holds across languages. + #[must_use = "this returns the titlecased character as a new iterator, \ + without modifying the original"] + #[unstable(feature = "titlecase", issue = "none")] + #[inline] + pub fn to_titlecase(self) -> ToTitlecase { + ToTitlecase(CaseMappingIter::new(conversions::to_title(self))) + } + /// Returns an iterator that yields the uppercase mapping of this `char` as one or more - /// `char`s. + /// `char`s. The iterator also has implementations of [`Display`][core::fmt::Display] + /// and [`PartialEq`]. /// /// If this `char` does not have an uppercase mapping, the iterator yields the same `char`. /// @@ -1100,9 +1325,16 @@ impl char { /// assert_eq!('山'.to_uppercase().to_string(), "山"); /// ``` /// + /// Check if a string is in uppercase: + /// + /// ``` + /// let s = "ABCDE\u{0301} 山"; + /// assert!(s.chars().all(|c| c.to_uppercase() == c)); + /// ``` + /// /// # Note on locale /// - /// In Turkish, the equivalent of 'i' in Latin has five forms instead of two: + /// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two: /// /// * 'Dotless': I / ı, sometimes written ï /// * 'Dotted': İ / i @@ -1114,7 +1346,7 @@ impl char { /// ``` /// /// The value of `upper_i` here relies on the language of the text: if we're - /// in `en-US`, it should be `"I"`, but if we're in `tr_TR`, it should + /// in `en-US`, it should be `"I"`, but if we're in `tr-TR` or `az-AZ`, it should /// be `"İ"`. `to_uppercase()` does not take this into account, and so: /// /// ``` @@ -1124,7 +1356,7 @@ impl char { /// ``` /// /// holds across languages. 
- #[must_use = "this returns the uppercase character as a new iterator, \ + #[must_use = "this returns the uppercased character as a new iterator, \ without modifying the original"] #[stable(feature = "rust1", since = "1.0.0")] #[inline] diff --git a/library/core/src/char/mod.rs b/library/core/src/char/mod.rs index a860c7c6aaadc..fae8f1ea99ff3 100644 --- a/library/core/src/char/mod.rs +++ b/library/core/src/char/mod.rs @@ -374,13 +374,22 @@ impl fmt::Display for EscapeDebug { } macro_rules! casemappingiter_impls { - ($(#[$attr:meta])* $ITER_NAME:ident) => { + ( + #[$stab:meta] + #[$dendstab:meta] + #[$fusedstab:meta] + #[$exactstab:meta] + #[$displaystab:meta] + #[$partialstab:meta] + $(#[$attr:meta])* + $ITER_NAME:ident + ) => { $(#[$attr])* - #[stable(feature = "rust1", since = "1.0.0")] + #[$stab] #[derive(Debug, Clone)] pub struct $ITER_NAME(CaseMappingIter); - #[stable(feature = "rust1", since = "1.0.0")] + #[$stab] impl Iterator for $ITER_NAME { type Item = char; fn next(&mut self) -> Option { @@ -416,7 +425,7 @@ macro_rules! casemappingiter_impls { } } - #[stable(feature = "case_mapping_double_ended", since = "1.59.0")] + #[$dendstab] impl DoubleEndedIterator for $ITER_NAME { fn next_back(&mut self) -> Option { self.0.next_back() @@ -434,10 +443,10 @@ macro_rules! casemappingiter_impls { } } - #[stable(feature = "fused", since = "1.26.0")] + #[$fusedstab] impl FusedIterator for $ITER_NAME {} - #[stable(feature = "exact_size_case_mapping_iter", since = "1.35.0")] + #[$exactstab] impl ExactSizeIterator for $ITER_NAME { fn len(&self) -> usize { self.0.len() @@ -464,33 +473,93 @@ macro_rules! 
casemappingiter_impls { #[unstable(feature = "std_internals", issue = "none")] unsafe impl TrustedRandomAccess for $ITER_NAME {} - #[stable(feature = "char_struct_display", since = "1.16.0")] + #[$displaystab] impl fmt::Display for $ITER_NAME { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Display::fmt(&self.0, f) } } + + #[$partialstab] + impl PartialEq for $ITER_NAME { + #[inline] + fn eq(&self, other: &ToUppercase) -> bool { + self.0 == other.0 + } + } + + #[unstable(feature = "titlecase", issue = "none")] + impl PartialEq for $ITER_NAME { + #[inline] + fn eq(&self, other: &ToTitlecase) -> bool { + self.0 == other.0 + } + } + + #[$partialstab] + impl PartialEq for $ITER_NAME { + #[inline] + fn eq(&self, other: &ToLowercase) -> bool { + self.0 == other.0 + } + } + + #[$partialstab] + impl PartialEq for $ITER_NAME { + #[inline] + fn eq(&self, other: &char) -> bool { + self.0 == *other + } + } } } casemappingiter_impls! { - /// Returns an iterator that yields the lowercase equivalent of a `char`. + #[stable(feature = "rust1", since = "1.0.0")] + #[stable(feature = "case_mapping_double_ended", since = "1.59.0")] + #[stable(feature = "fused", since = "1.26.0")] + #[stable(feature = "exact_size_case_mapping_iter", since = "1.35.0")] + #[stable(feature = "char_struct_display", since = "1.16.0")] + #[stable(feature = "iter_partialeq", since = "CURRENT_RUSTC_VERSION")] + /// Returns an iterator that yields the uppercase equivalent of a `char`. /// - /// This `struct` is created by the [`to_lowercase`] method on [`char`]. See + /// This `struct` is created by the [`to_uppercase`] method on [`char`]. See /// its documentation for more. /// - /// [`to_lowercase`]: char::to_lowercase - ToLowercase + /// [`to_uppercase`]: char::to_uppercase + ToUppercase } casemappingiter_impls! { - /// Returns an iterator that yields the uppercase equivalent of a `char`. 
+ #[unstable(feature = "titlecase", issue = "none")] + #[unstable(feature = "titlecase", issue = "none")] + #[unstable(feature = "titlecase", issue = "none")] + #[unstable(feature = "titlecase", issue = "none")] + #[unstable(feature = "titlecase", issue = "none")] + #[unstable(feature = "titlecase", issue = "none")] + /// Returns an iterator that yields the titlecase equivalent of a `char`. /// - /// This `struct` is created by the [`to_uppercase`] method on [`char`]. See + /// This `struct` is created by the [`to_titlecase`] method on [`char`]. See /// its documentation for more. /// - /// [`to_uppercase`]: char::to_uppercase - ToUppercase + /// [`to_titlecase`]: char::to_titlecase + ToTitlecase +} + +casemappingiter_impls! { + #[stable(feature = "rust1", since = "1.0.0")] + #[stable(feature = "case_mapping_double_ended", since = "1.59.0")] + #[stable(feature = "fused", since = "1.26.0")] + #[stable(feature = "exact_size_case_mapping_iter", since = "1.35.0")] + #[stable(feature = "char_struct_display", since = "1.16.0")] + #[stable(feature = "iter_partialeq", since = "CURRENT_RUSTC_VERSION")] + /// Returns an iterator that yields the lowercase equivalent of a `char`. + /// + /// This `struct` is created by the [`to_lowercase`] method on [`char`]. See + /// its documentation for more. + /// + /// [`to_lowercase`]: char::to_lowercase + ToLowercase } #[derive(Debug, Clone)] @@ -599,6 +668,22 @@ impl fmt::Display for CaseMappingIter { } } +impl PartialEq for CaseMappingIter { + #[inline] + fn eq(&self, other: &Self) -> bool { + self.0.as_slice() == other.0.as_slice() + } +} + +impl Eq for CaseMappingIter {} + +impl PartialEq for CaseMappingIter { + #[inline] + fn eq(&self, other: &char) -> bool { + self.0.as_slice() == &[*other] + } +} + /// The error type returned when a checked char conversion fails. 
#[stable(feature = "u8_from_char", since = "1.59.0")] #[derive(Debug, Copy, Clone, PartialEq, Eq)] @@ -613,3 +698,16 @@ impl fmt::Display for TryFromCharError { #[stable(feature = "u8_from_char", since = "1.59.0")] impl Error for TryFromCharError {} + +/// The case of a cased character, +/// as returned by [`char::case`]. +#[unstable(feature = "titlecase", issue = "none")] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub enum CharCase { + /// Lowercase. Corresponds to the `Lowercase` Unicode property. + Lower = 0b00, + /// Titlecase. Corresponds to the `Titlecase_Letter` Unicode general category. + Title = 0b10, + /// Uppercase. Corresponds to the `Uppercase` Unicode property. + Upper = 0b11, +} diff --git a/library/core/src/unicode/mod.rs b/library/core/src/unicode/mod.rs index e1faa407d54c5..4dafe260f941b 100644 --- a/library/core/src/unicode/mod.rs +++ b/library/core/src/unicode/mod.rs @@ -18,11 +18,10 @@ mod unicode_data; pub const UNICODE_VERSION: (u8, u8, u8) = unicode_data::UNICODE_VERSION; // For use in alloc, not re-exported in std. 
-pub use unicode_data::{ - case_ignorable::lookup as Case_Ignorable, cased::lookup as Cased, conversions, -}; +pub use unicode_data::{case_ignorable::lookup as Case_Ignorable, conversions}; pub(crate) use unicode_data::alphabetic::lookup as Alphabetic; +pub(crate) use unicode_data::cased::lookup as Cased; pub(crate) use unicode_data::cc::lookup as Cc; pub(crate) use unicode_data::grapheme_extend::lookup as Grapheme_Extend; pub(crate) use unicode_data::lowercase::lookup as Lowercase; diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs index dd2ad9a58f679..7e5c8df4f9c3c 100644 --- a/library/core/src/unicode/unicode_data.rs +++ b/library/core/src/unicode/unicode_data.rs @@ -616,7 +616,24 @@ pub mod conversions { } } - static LOWERCASE_TABLE: &[(char, u32)] = &[ + pub fn to_title(c: char) -> [char; 3] { + if c.is_ascii() { + [(c as u8).to_ascii_uppercase() as char, '\0', '\0'] + } else { + TITLECASE_TABLE + .binary_search_by(|&(key, _)| key.cmp(&c)) + .map(|i| { + let u = TITLECASE_TABLE[i].1; + char::from_u32(u).map(|c| [c, '\0', '\0']).unwrap_or_else(|| { + // SAFETY: Index comes from statically generated table + unsafe { *TITLECASE_TABLE_MULTI.get_unchecked((u & (INDEX_MASK - 1)) as usize) } + }) + }) + .unwrap_or(to_upper(c)) + } + } + + static LOWERCASE_TABLE: [(char, u32); 1407] = [ ('\u{c0}', 224), ('\u{c1}', 225), ('\u{c2}', 226), ('\u{c3}', 227), ('\u{c4}', 228), ('\u{c5}', 229), ('\u{c6}', 230), ('\u{c7}', 231), ('\u{c8}', 232), ('\u{c9}', 233), ('\u{ca}', 234), ('\u{cb}', 235), ('\u{cc}', 236), ('\u{cd}', 237), ('\u{ce}', 238), @@ -959,11 +976,11 @@ pub mod conversions { ('\u{1e920}', 125250), ('\u{1e921}', 125251), ]; - static LOWERCASE_TABLE_MULTI: &[[char; 3]] = &[ + static LOWERCASE_TABLE_MULTI: [[char; 3]; 1] = [ ['i', '\u{307}', '\u{0}'], ]; - static UPPERCASE_TABLE: &[(char, u32)] = &[ + static UPPERCASE_TABLE: [(char, u32); 1499] = [ ('\u{b5}', 924), ('\u{df}', 4194304), ('\u{e0}', 192), ('\u{e1}', 193), 
('\u{e2}', 194), ('\u{e3}', 195), ('\u{e4}', 196), ('\u{e5}', 197), ('\u{e6}', 198), ('\u{e7}', 199), ('\u{e8}', 200), ('\u{e9}', 201), ('\u{ea}', 202), ('\u{eb}', 203), ('\u{ec}', 204), @@ -1330,7 +1347,7 @@ pub mod conversions { ('\u{1e942}', 125216), ('\u{1e943}', 125217), ]; - static UPPERCASE_TABLE_MULTI: &[[char; 3]] = &[ + static UPPERCASE_TABLE_MULTI: [[char; 3]; 102] = [ ['S', 'S', '\u{0}'], ['\u{2bc}', 'N', '\u{0}'], ['J', '\u{30c}', '\u{0}'], ['\u{399}', '\u{308}', '\u{301}'], ['\u{3a5}', '\u{308}', '\u{301}'], ['\u{535}', '\u{552}', '\u{0}'], ['H', '\u{331}', '\u{0}'], ['T', '\u{308}', '\u{0}'], @@ -1380,4 +1397,53 @@ pub mod conversions { ['\u{544}', '\u{53b}', '\u{0}'], ['\u{54e}', '\u{546}', '\u{0}'], ['\u{544}', '\u{53d}', '\u{0}'], ]; + + static TITLECASE_TABLE: [(char, u32); 135] = [ + ('\u{df}', 4194304), ('\u{1c4}', 453), ('\u{1c5}', 453), ('\u{1c6}', 453), + ('\u{1c7}', 456), ('\u{1c8}', 456), ('\u{1c9}', 456), ('\u{1ca}', 459), ('\u{1cb}', 459), + ('\u{1cc}', 459), ('\u{1f1}', 498), ('\u{1f2}', 498), ('\u{1f3}', 498), + ('\u{587}', 4194305), ('\u{10d0}', 4304), ('\u{10d1}', 4305), ('\u{10d2}', 4306), + ('\u{10d3}', 4307), ('\u{10d4}', 4308), ('\u{10d5}', 4309), ('\u{10d6}', 4310), + ('\u{10d7}', 4311), ('\u{10d8}', 4312), ('\u{10d9}', 4313), ('\u{10da}', 4314), + ('\u{10db}', 4315), ('\u{10dc}', 4316), ('\u{10dd}', 4317), ('\u{10de}', 4318), + ('\u{10df}', 4319), ('\u{10e0}', 4320), ('\u{10e1}', 4321), ('\u{10e2}', 4322), + ('\u{10e3}', 4323), ('\u{10e4}', 4324), ('\u{10e5}', 4325), ('\u{10e6}', 4326), + ('\u{10e7}', 4327), ('\u{10e8}', 4328), ('\u{10e9}', 4329), ('\u{10ea}', 4330), + ('\u{10eb}', 4331), ('\u{10ec}', 4332), ('\u{10ed}', 4333), ('\u{10ee}', 4334), + ('\u{10ef}', 4335), ('\u{10f0}', 4336), ('\u{10f1}', 4337), ('\u{10f2}', 4338), + ('\u{10f3}', 4339), ('\u{10f4}', 4340), ('\u{10f5}', 4341), ('\u{10f6}', 4342), + ('\u{10f7}', 4343), ('\u{10f8}', 4344), ('\u{10f9}', 4345), ('\u{10fa}', 4346), + ('\u{10fd}', 4349), ('\u{10fe}', 
4350), ('\u{10ff}', 4351), ('\u{1f80}', 8072), + ('\u{1f81}', 8073), ('\u{1f82}', 8074), ('\u{1f83}', 8075), ('\u{1f84}', 8076), + ('\u{1f85}', 8077), ('\u{1f86}', 8078), ('\u{1f87}', 8079), ('\u{1f88}', 8072), + ('\u{1f89}', 8073), ('\u{1f8a}', 8074), ('\u{1f8b}', 8075), ('\u{1f8c}', 8076), + ('\u{1f8d}', 8077), ('\u{1f8e}', 8078), ('\u{1f8f}', 8079), ('\u{1f90}', 8088), + ('\u{1f91}', 8089), ('\u{1f92}', 8090), ('\u{1f93}', 8091), ('\u{1f94}', 8092), + ('\u{1f95}', 8093), ('\u{1f96}', 8094), ('\u{1f97}', 8095), ('\u{1f98}', 8088), + ('\u{1f99}', 8089), ('\u{1f9a}', 8090), ('\u{1f9b}', 8091), ('\u{1f9c}', 8092), + ('\u{1f9d}', 8093), ('\u{1f9e}', 8094), ('\u{1f9f}', 8095), ('\u{1fa0}', 8104), + ('\u{1fa1}', 8105), ('\u{1fa2}', 8106), ('\u{1fa3}', 8107), ('\u{1fa4}', 8108), + ('\u{1fa5}', 8109), ('\u{1fa6}', 8110), ('\u{1fa7}', 8111), ('\u{1fa8}', 8104), + ('\u{1fa9}', 8105), ('\u{1faa}', 8106), ('\u{1fab}', 8107), ('\u{1fac}', 8108), + ('\u{1fad}', 8109), ('\u{1fae}', 8110), ('\u{1faf}', 8111), ('\u{1fb2}', 4194306), + ('\u{1fb3}', 8124), ('\u{1fb4}', 4194307), ('\u{1fb7}', 4194308), ('\u{1fbc}', 8124), + ('\u{1fc2}', 4194309), ('\u{1fc3}', 8140), ('\u{1fc4}', 4194310), ('\u{1fc7}', 4194311), + ('\u{1fcc}', 8140), ('\u{1ff2}', 4194312), ('\u{1ff3}', 8188), ('\u{1ff4}', 4194313), + ('\u{1ff7}', 4194314), ('\u{1ffc}', 8188), ('\u{fb00}', 4194315), ('\u{fb01}', 4194316), + ('\u{fb02}', 4194317), ('\u{fb03}', 4194318), ('\u{fb04}', 4194319), ('\u{fb05}', 4194320), + ('\u{fb06}', 4194321), ('\u{fb13}', 4194322), ('\u{fb14}', 4194323), ('\u{fb15}', 4194324), + ('\u{fb16}', 4194325), ('\u{fb17}', 4194326), + ]; + + static TITLECASE_TABLE_MULTI: [[char; 3]; 23] = [ + ['S', 's', '\u{0}'], ['\u{535}', '\u{582}', '\u{0}'], ['\u{1fba}', '\u{345}', '\u{0}'], + ['\u{386}', '\u{345}', '\u{0}'], ['\u{391}', '\u{342}', '\u{345}'], + ['\u{1fca}', '\u{345}', '\u{0}'], ['\u{389}', '\u{345}', '\u{0}'], + ['\u{397}', '\u{342}', '\u{345}'], ['\u{1ffa}', '\u{345}', '\u{0}'], + 
['\u{38f}', '\u{345}', '\u{0}'], ['\u{3a9}', '\u{342}', '\u{345}'], ['F', 'f', '\u{0}'], + ['F', 'i', '\u{0}'], ['F', 'l', '\u{0}'], ['F', 'f', 'i'], ['F', 'f', 'l'], + ['S', 't', '\u{0}'], ['S', 't', '\u{0}'], ['\u{544}', '\u{576}', '\u{0}'], + ['\u{544}', '\u{565}', '\u{0}'], ['\u{544}', '\u{56b}', '\u{0}'], + ['\u{54e}', '\u{576}', '\u{0}'], ['\u{544}', '\u{56d}', '\u{0}'], + ]; } diff --git a/library/core/tests/char.rs b/library/core/tests/char.rs index 6422387e9560b..0b7cb35fae978 100644 --- a/library/core/tests/char.rs +++ b/library/core/tests/char.rs @@ -1,5 +1,8 @@ use std::str::FromStr; -use std::{char, str}; +use std::{ + char::{self, CharCase}, + str, +}; #[test] fn test_convert() { @@ -41,6 +44,29 @@ fn test_from_str() { assert!(char::from_str("abc").is_err()); } +#[test] +fn test_is_cased() { + assert!('a'.is_cased()); + assert!('ö'.is_cased()); + assert!('ß'.is_cased()); + assert!('Ü'.is_cased()); + assert!('P'.is_cased()); + assert!('ª'.is_cased()); + assert!(!'攂'.is_cased()); +} + +#[test] +fn test_char_case() { + for c in '\0'..='\u{10FFFF}' { + match c.case() { + None => assert!(!c.is_cased()), + Some(CharCase::Lower) => assert!(c.is_lowercase()), + Some(CharCase::Upper) => assert!(c.is_uppercase()), + Some(CharCase::Title) => assert!(c.is_titlecase()), + } + } +} + #[test] fn test_is_lowercase() { assert!('a'.is_lowercase()); @@ -50,6 +76,17 @@ fn test_is_lowercase() { assert!(!'P'.is_lowercase()); } +#[test] +fn test_is_titlecase() { + assert!('Dž'.is_titlecase()); + assert!('ᾨ'.is_titlecase()); + assert!(!'h'.is_titlecase()); + assert!(!'ä'.is_titlecase()); + assert!(!'ß'.is_titlecase()); + assert!(!'Ö'.is_titlecase()); + assert!(!'T'.is_titlecase()); +} + #[test] fn test_is_uppercase() { assert!(!'h'.is_uppercase()); @@ -59,6 +96,26 @@ fn test_is_uppercase() { assert!('T'.is_uppercase()); } +#[test] +fn titlecase_fast_path() { + for c in '\0'..='\u{01C4}' { + assert!(!(c.is_cased() && !c.is_lowercase() && !c.is_uppercase())) + } +} + +#[test] 
+fn at_most_one_case() { + for c in '\0'..='\u{10FFFF}' { + assert_eq!( + !c.is_cased() as u8 + + c.is_lowercase() as u8 + + c.is_uppercase() as u8 + + c.is_titlecase() as u8, + 1 + ); + } +} + #[test] fn test_is_whitespace() { assert!(' '.is_whitespace()); diff --git a/library/core/tests/lib.rs b/library/core/tests/lib.rs index 421062f5873cd..c52e72d62810b 100644 --- a/library/core/tests/lib.rs +++ b/library/core/tests/lib.rs @@ -70,6 +70,7 @@ #![feature(str_internals)] #![feature(std_internals)] #![feature(test)] +#![feature(titlecase)] #![feature(trusted_len)] #![feature(try_blocks)] #![feature(try_trait_v2)] diff --git a/src/tools/unicode-table-generator/src/case_mapping.rs b/src/tools/unicode-table-generator/src/case_mapping.rs index 7a978db62b40d..c285d2a994d98 100644 --- a/src/tools/unicode-table-generator/src/case_mapping.rs +++ b/src/tools/unicode-table-generator/src/case_mapping.rs @@ -17,6 +17,8 @@ pub(crate) fn generate_case_mapping(data: &UnicodeData) -> String { file.push_str(&generate_tables("LOWER", &data.to_lower)); file.push_str("\n\n"); file.push_str(&generate_tables("UPPER", &data.to_upper)); + file.push_str("\n\n"); + file.push_str(&generate_tables("TITLE", &data.to_title)); file } @@ -48,13 +50,25 @@ fn generate_tables(case: &str, data: &BTreeMap) -> String let mut tables = String::new(); - write!(tables, "static {}CASE_TABLE: &[(char, u32)] = &[{}];", case, fmt_list(mappings)) - .unwrap(); + write!( + tables, + "static {}CASE_TABLE: [(char, u32); {}] = [{}];", + case, + mappings.len(), + fmt_list(mappings) + ) + .unwrap(); tables.push_str("\n\n"); - write!(tables, "static {}CASE_TABLE_MULTI: &[[char; 3]] = &[{}];", case, fmt_list(multis)) - .unwrap(); + write!( + tables, + "static {}CASE_TABLE_MULTI: [[char; 3]; {}] = [{}];", + case, + multis.len(), + fmt_list(multis) + ) + .unwrap(); tables } @@ -101,4 +115,21 @@ pub fn to_upper(c: char) -> [char; 3] { .unwrap_or([c, '\0', '\0']) } } + +pub fn to_title(c: char) -> [char; 3] { + if 
c.is_ascii() { + [(c as u8).to_ascii_uppercase() as char, '\0', '\0'] + } else { + TITLECASE_TABLE + .binary_search_by(|&(key, _)| key.cmp(&c)) + .map(|i| { + let u = TITLECASE_TABLE[i].1; + char::from_u32(u).map(|c| [c, '\0', '\0']).unwrap_or_else(|| { + // SAFETY: Index comes from statically generated table + unsafe { *TITLECASE_TABLE_MULTI.get_unchecked((u & (INDEX_MASK - 1)) as usize) } + }) + }) + .unwrap_or(to_upper(c)) + } +} "; diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index 2fe578acd90e7..dd340ccece27f 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -97,20 +97,20 @@ static PROPERTIES: &[&str] = &[ struct UnicodeData { ranges: Vec<(&'static str, Vec>)>, + /// Only stores mappings that are not to self to_upper: BTreeMap, + /// Only stores mappings that differ from `to_upper` + to_title: BTreeMap, + /// Only stores mappings that are not to self to_lower: BTreeMap, } -fn to_mapping(origin: u32, codepoints: Vec) -> Option<(u32, u32, u32)> { +fn to_mapping(codepoints: Vec) -> Option<(u32, u32, u32)> { let mut a = None; let mut b = None; let mut c = None; for codepoint in codepoints { - if origin == codepoint.value() { - return None; - } - if a.is_none() { a = Some(codepoint.value()); } else if b.is_none() { @@ -144,6 +144,7 @@ fn load_data() -> UnicodeData { let mut to_lower = BTreeMap::new(); let mut to_upper = BTreeMap::new(); + let mut to_title = BTreeMap::new(); for row in ucd_parse::UnicodeDataExpander::new( ucd_parse::parse::<_, ucd_parse::UnicodeData>(&UNICODE_DIRECTORY).unwrap(), ) { @@ -169,6 +170,11 @@ fn load_data() -> UnicodeData { to_upper.insert(row.codepoint.value(), (mapped.value(), 0, 0)); } } + if let Some(mapped) = row.simple_titlecase_mapping { + if Some(mapped) != row.simple_uppercase_mapping { + to_title.insert(row.codepoint.value(), (mapped.value(), 0, 0)); + } + } } for row in ucd_parse::parse::<_, 
ucd_parse::SpecialCaseMapping>(&UNICODE_DIRECTORY).unwrap() { @@ -178,11 +184,21 @@ fn load_data() -> UnicodeData { } let key = row.codepoint.value(); - if let Some(lower) = to_mapping(key, row.lowercase) { - to_lower.insert(key, lower); + if let Some(lower) = to_mapping(row.lowercase) { + if lower != (key, 0, 0) { + to_lower.insert(key, lower); + } + } + let upper_mapping = to_mapping(row.uppercase); + if let Some(upper) = upper_mapping { + if upper != (key, 0, 0) { + to_upper.insert(key, upper); + } } - if let Some(upper) = to_mapping(key, row.uppercase) { - to_upper.insert(key, upper); + if let Some(title) = to_mapping(row.titlecase) { + if Some(title) != upper_mapping { + to_title.insert(key, title); + } } } @@ -214,7 +230,7 @@ fn load_data() -> UnicodeData { let mut properties = properties.into_iter().collect::>(); properties.sort_by_key(|p| p.0); - UnicodeData { ranges: properties, to_lower, to_upper } + UnicodeData { ranges: properties, to_lower, to_title, to_upper } } fn main() {