diff --git a/Cargo.lock b/Cargo.lock index a26ff30f5e4..f97f389a26b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -480,6 +480,17 @@ dependencies = [ "memchr", ] +[[package]] +name = "derivative" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcc3dd5e9e9c0b295d6e1e4d811fb6f157d5ffd784b8d202fc62eac8035a770b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "dhat" version = "0.2.2" @@ -1335,6 +1346,7 @@ dependencies = [ "icu_benchmark_macros", "icu_provider", "litemap", + "num_enum", "postcard", "serde", "serde_json", @@ -1638,6 +1650,27 @@ dependencies = [ "libc", ] +[[package]] +name = "num_enum" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9bd055fb730c4f8f4f57d45d35cd6b3f0980535b056dc7ff119cee6a66ed6f" +dependencies = [ + "derivative", + "num_enum_derive", +] + +[[package]] +name = "num_enum_derive" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "486ea01961c4a818096de679a8b740b26d9033146ac5291b1c98557658f8cdd9" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "object" version = "0.23.0" diff --git a/components/uniset/Cargo.toml b/components/uniset/Cargo.toml index 94473b8cf49..8c000d5f9e5 100644 --- a/components/uniset/Cargo.toml +++ b/components/uniset/Cargo.toml @@ -34,6 +34,7 @@ all-features = true [dependencies] icu_provider = { version = "0.3", path = "../../provider/core", features = ["macros"] } litemap = { version = "0.2", path = "../../utils/litemap" } +num_enum = { version = "0.5.4", default-features = false } serde = { version = "1.0", default-features = false, features = ["derive", "alloc"], optional = true } tinystr = { version = "0.4.10", features = ["alloc"], default-features = false } displaydoc = { version = "0.2.3", default-features = false } diff --git a/components/uniset/src/enum_props.rs b/components/uniset/src/enum_props.rs index 0e230481840..07511890ee7 100644 --- a/components/uniset/src/enum_props.rs +++ b/components/uniset/src/enum_props.rs @@ -4,6 +4,8 @@ //! A collection of enums for enumerated properties. +use num_enum::{TryFromPrimitive, UnsafeFromPrimitive}; + /// Selection constants for Unicode properties. /// These constants are used to select one of the Unicode properties. /// See UProperty in ICU4C. @@ -16,249 +18,316 @@ pub enum EnumeratedProperty { ScriptExtensions = 0x7000, } +/// Enumerated Unicode general category types. +/// GeneralSubcategory only supports specific subcategories (eg UppercaseLetter). +/// It does not support grouped categories (eg Letter). For grouped categories, use GeneralCategory. +#[derive(Copy, Clone, PartialEq, Debug, TryFromPrimitive, UnsafeFromPrimitive)] +#[allow(missing_docs)] // TODO(#1030) - Add missing docs. +#[repr(u8)] +pub enum GeneralSubcategory { + Unassigned = 0, + + UppercaseLetter = 1, + LowercaseLetter = 2, + TitlecaseLetter = 3, + ModifierLetter = 4, + OtherLetter = 5, + + NonspacingMark = 6, + EnclosingMark = 7, + SpacingMark = 8, + + Digit = 9, + LetterNumber = 10, + OtherNumber = 11, + + SpaceSeparator = 12, + LineSeparator = 13, + ParagraphSeparator = 14, + + Control = 15, + Format = 16, + PrivateUse = 17, + Surrogate = 18, + + DashPunctuation = 19, + OpenPunctuation = 20, + ClosePunctuation = 21, + ConnectorPunctuation = 22, + OtherPunctuation = 23, + InitialPunctuation = 28, + FinalPunctuation = 29, + + MathSymbol = 24, + CurrencySymbol = 25, + ModifierSymbol = 26, + OtherSymbol = 27, +} + /// Enumerated Unicode general category types. /// The discriminants correspond to the U_GC_XX_MASK constants in ICU4C. -/// This supports groups of general categories: for example, `Letter` +/// Unlike GeneralSubcategory, this supports groups of general categories: for example, `Letter` /// is the union of `UppercaseLetter`, `LowercaseLetter`, etc... /// See https://www.unicode.org/reports/tr44/ . /// See UCharCategory and U_GET_GC_MASK in ICU4C. -#[derive(Clone, PartialEq, Debug)] +#[derive(Copy, Clone, PartialEq, Debug, Eq)] #[allow(missing_docs)] // TODO(#1030) - Add missing docs. -#[repr(u32)] -#[non_exhaustive] -pub enum GeneralCategory { - Unassigned = 0, +#[repr(transparent)] +pub struct GeneralCategory(pub(crate) u32); + +use GeneralCategory as GC; +use GeneralSubcategory as GS; - UppercaseLetter = 1 << 1, - LowercaseLetter = 1 << 2, - TitlecaseLetter = 1 << 3, - ModifierLetter = 1 << 4, - OtherLetter = 1 << 5, - CasedLetter = - Self::UppercaseLetter as u32 | Self::LowercaseLetter as u32 | Self::TitlecaseLetter as u32, - Letter = Self::CasedLetter as u32 | Self::ModifierLetter as u32 | Self::OtherLetter as u32, +#[allow(missing_docs)] // These constants don't need documentation. +#[allow(non_upper_case_globals)] +impl GeneralCategory { + pub const Unassigned: GeneralCategory = GC(1 << (GS::Unassigned as u32)); + pub const UppercaseLetter: GeneralCategory = GC(1 << (GS::UppercaseLetter as u32)); + pub const LowercaseLetter: GeneralCategory = GC(1 << (GS::LowercaseLetter as u32)); + pub const TitlecaseLetter: GeneralCategory = GC(1 << (GS::TitlecaseLetter as u32)); + pub const ModifierLetter: GeneralCategory = GC(1 << (GS::ModifierLetter as u32)); + pub const OtherLetter: GeneralCategory = GC(1 << (GS::OtherLetter as u32)); + pub const CasedLetter: GeneralCategory = GC(1 << (GS::UppercaseLetter as u32) + | 1 << (GS::LowercaseLetter as u32) + | 1 << (GS::TitlecaseLetter as u32)); + pub const Letter: GeneralCategory = GC(1 << (GS::UppercaseLetter as u32) + | 1 << (GS::LowercaseLetter as u32) + | 1 << (GS::TitlecaseLetter as u32) + | 1 << (GS::ModifierLetter as u32) + | 1 << (GS::OtherLetter as u32)); - NonspacingMark = 1 << 6, - EnclosingMark = 1 << 7, - SpacingMark = 1 << 8, - Mark = Self::NonspacingMark as u32 | Self::EnclosingMark as u32 | Self::SpacingMark as u32, + pub const NonspacingMark: GeneralCategory = GC(1 << (GS::NonspacingMark as u32)); + pub const EnclosingMark: GeneralCategory = GC(1 << (GS::EnclosingMark as u32)); + pub const SpacingMark: GeneralCategory = GC(1 << (GS::SpacingMark as u32)); + pub const Mark: GeneralCategory = GC(1 << (GS::NonspacingMark as u32) + | 1 << (GS::EnclosingMark as u32) + | 1 << (GS::SpacingMark as u32)); - Digit = 1 << 9, - LetterNumber = 1 << 10, - OtherNumber = 1 << 11, - Number = Self::Digit as u32 | Self::LetterNumber as u32 | Self::OtherNumber as u32, + pub const Digit: GeneralCategory = GC(1 << (GS::Digit as u32)); + pub const LetterNumber: GeneralCategory = GC(1 << (GS::LetterNumber as u32)); + pub const OtherNumber: GeneralCategory = GC(1 << (GS::OtherNumber as u32)); + pub const Number: GeneralCategory = GC(1 << (GS::Digit as u32) + | 1 << (GS::LetterNumber as u32) + | 1 << (GS::OtherNumber as u32)); - SpaceSeparator = 1 << 12, - LineSeparator = 1 << 13, - ParagraphSeparator = 1 << 14, - Separator = - Self::SpaceSeparator as u32 | Self::LineSeparator as u32 | Self::ParagraphSeparator as u32, + pub const SpaceSeparator: GeneralCategory = GC(1 << (GS::SpaceSeparator as u32)); + pub const LineSeparator: GeneralCategory = GC(1 << (GS::LineSeparator as u32)); + pub const ParagraphSeparator: GeneralCategory = GC(1 << (GS::ParagraphSeparator as u32)); + pub const Separator: GeneralCategory = GC(1 << (GS::SpaceSeparator as u32) + | 1 << (GS::LineSeparator as u32) + | 1 << (GS::ParagraphSeparator as u32)); - Control = 1 << 15, - Format = 1 << 16, - PrivateUse = 1 << 17, - Surrogate = 1 << 18, - Other = Self::Control as u32 - | Self::Format as u32 - | Self::PrivateUse as u32 - | Self::Surrogate as u32, + pub const Control: GeneralCategory = GC(1 << (GS::Control as u32)); + pub const Format: GeneralCategory = GC(1 << (GS::Format as u32)); + pub const PrivateUse: GeneralCategory = GC(1 << (GS::PrivateUse as u32)); + pub const Surrogate: GeneralCategory = GC(1 << (GS::Surrogate as u32)); + pub const Other: GeneralCategory = GC(1 << (GS::Control as u32) + | 1 << (GS::Format as u32) + | 1 << (GS::PrivateUse as u32) + | 1 << (GS::Surrogate as u32)); - DashPunctuation = 1 << 19, - OpenPunctuation = 1 << 20, - ClosePunctuation = 1 << 21, - ConnectorPunctuation = 1 << 22, - OtherPunctuation = 1 << 23, - InitialPunctuation = 1 << 28, - FinalPunctuation = 1 << 29, - Punctuation = Self::DashPunctuation as u32 - | Self::OpenPunctuation as u32 - | Self::ClosePunctuation as u32 - | Self::ConnectorPunctuation as u32 - | Self::OtherPunctuation as u32 - | Self::InitialPunctuation as u32 - | Self::FinalPunctuation as u32, + pub const DashPunctuation: GeneralCategory = GC(1 << (GS::DashPunctuation as u32)); + pub const OpenPunctuation: GeneralCategory = GC(1 << (GS::OpenPunctuation as u32)); + pub const ClosePunctuation: GeneralCategory = GC(1 << (GS::ClosePunctuation as u32)); + pub const ConnectorPunctuation: GeneralCategory = GC(1 << (GS::ConnectorPunctuation as u32)); + pub const OtherPunctuation: GeneralCategory = GC(1 << (GS::OtherPunctuation as u32)); + pub const InitialPunctuation: GeneralCategory = GC(1 << (GS::InitialPunctuation as u32)); + pub const FinalPunctuation: GeneralCategory = GC(1 << (GS::FinalPunctuation as u32)); + pub const Punctuation: GeneralCategory = GC(1 << (GS::DashPunctuation as u32) + | 1 << (GS::OpenPunctuation as u32) + | 1 << (GS::ClosePunctuation as u32) + | 1 << (GS::ConnectorPunctuation as u32) + | 1 << (GS::OtherPunctuation as u32) + | 1 << (GS::InitialPunctuation as u32) + | 1 << (GS::FinalPunctuation as u32)); - MathSymbol = 1 << 24, - CurrencySymbol = 1 << 25, - ModifierSymbol = 1 << 26, - OtherSymbol = 1 << 27, - Symbol = Self::MathSymbol as u32 - | Self::CurrencySymbol as u32 - | Self::ModifierSymbol as u32 - | Self::OtherSymbol as u32, + pub const MathSymbol: GeneralCategory = GC(1 << (GS::MathSymbol as u32)); + pub const CurrencySymbol: GeneralCategory = GC(1 << (GS::CurrencySymbol as u32)); + pub const ModifierSymbol: GeneralCategory = GC(1 << (GS::ModifierSymbol as u32)); + pub const OtherSymbol: GeneralCategory = GC(1 << (GS::OtherSymbol as u32)); + pub const Symbol: GeneralCategory = GC(1 << (GS::MathSymbol as u32) + | 1 << (GS::CurrencySymbol as u32) + | 1 << (GS::ModifierSymbol as u32) + | 1 << (GS::OtherSymbol as u32)); +} + +impl From for GeneralCategory { + fn from(subcategory: GeneralSubcategory) -> Self { + GeneralCategory(1 << (subcategory as u32)) + } } /// Enumerated property Script. /// /// For more information, see UAX #24: http://www.unicode.org/reports/tr24/. /// See UScriptCode in ICU4C. -/// -/// This enum only contains variants for scripts that are used in the Unicode -/// Property Database. -#[derive(Clone, PartialEq, Debug)] -#[allow(missing_docs)] // The variants should not need documenting. -#[non_exhaustive] -pub enum Script { - Adlam = 167, - Ahom = 161, - AnatolianHieroglyphs = 156, - Arabic = 2, - Armenian = 3, - Avestan = 117, - Balinese = 62, - Bamum = 130, - BassaVah = 134, - Batak = 63, - Bengali = 4, - Bhaiksuki = 168, - Bopomofo = 5, - Brahmi = 65, - Braille = 46, - Buginese = 55, - Buhid = 44, - CanadianAboriginal = 40, - Carian = 104, - CaucasianAlbanian = 159, - Chakma = 118, - Cham = 66, - Cherokee = 6, - Chorasmian = 189, - Common = 0, - Coptic = 7, - Cuneiform = 101, - Cypriot = 47, - CyproMinoan = 193, - Cyrillic = 8, - Deseret = 9, - Devanagari = 10, - DivesAkuru = 190, - Dogra = 178, - Duployan = 135, - EgyptianHieroglyphs = 71, - Elbasan = 136, - Elymaic = 185, - Ethiopic = 11, - Georgian = 12, - Glagolitic = 56, - Gothic = 13, - Grantha = 137, - Greek = 14, - Gujarati = 15, - GunjalaGondi = 179, - Gurmukhi = 16, - Han = 17, - Hangul = 18, - HanifiRohingya = 182, - Hanunoo = 43, - Hatran = 162, - Hebrew = 19, - Hiragana = 20, - ImperialAramaic = 116, - Inherited = 1, - InscriptionalPahlavi = 122, - InscriptionalParthian = 125, - Javanese = 78, - Kaithi = 120, - Kannada = 21, - Katakana = 22, - KayahLi = 79, - Kharoshthi = 57, - KhitanSmallScript = 191, - Khmer = 23, - Khojki = 157, - Khudawadi = 145, - Lao = 24, - Latin = 25, - Lepcha = 82, - Limbu = 48, - LinearA = 83, - LinearB = 49, - Lisu = 131, - Lycian = 107, - Lydian = 108, - Mahajani = 160, - Makasar = 180, - Malayalam = 26, - Mandaic = 84, - Manichaean = 121, - Marchen = 169, - MasaramGondi = 175, - Medefaidrin = 181, - MeeteiMayek = 115, - MendeKikakui = 140, - MeroiticCursive = 141, - MeroiticHieroglyphs = 86, - Miao = 92, - Modi = 163, - Mongolian = 27, - Mro = 149, - Multani = 164, - Myanmar = 28, - Nabataean = 143, - Nandinagari = 187, - NewTaiLue = 59, - Newa = 170, - Nko = 87, - Nushu = 150, - NyiakengPuachueHmong = 186, - Ogham = 29, - OlChiki = 109, - OldHungarian = 76, - OldItalic = 30, - OldNorthArabian = 142, - OldPermic = 89, - OldPersian = 61, - OldSogdian = 184, - OldSouthArabian = 133, - OldTurkic = 88, - OldUyghur = 194, - Oriya = 31, - Osage = 171, - Osmanya = 50, - PahawhHmong = 75, - Palmyrene = 144, - PauCinHau = 165, - PhagsPa = 90, - Phoenician = 91, - PsalterPahlavi = 123, - Rejang = 110, - Runic = 32, - Samaritan = 126, - Saurashtra = 111, - Sharada = 151, - Shavian = 51, - Siddham = 166, - SignWriting = 112, - Sinhala = 33, - Sogdian = 183, - SoraSompeng = 152, - Soyombo = 176, - Sundanese = 113, - SylotiNagri = 58, - Syriac = 34, - Tagalog = 42, - Tagbanwa = 45, - TaiLe = 52, - TaiTham = 106, - TaiViet = 127, - Takri = 153, - Tamil = 35, - Tangsa = 195, - Tangut = 154, - Telugu = 36, - Thaana = 37, - Thai = 38, - Tibetan = 39, - Tifinagh = 60, - Tirhuta = 158, - Toto = 196, - Ugaritic = 53, - Unknown = 103, - Vai = 99, - Vithkuqi = 197, - Wancho = 188, - WarangCiti = 146, - Yezidi = 192, - Yi = 41, - ZanabazarSquare = 177, +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[repr(transparent)] +pub struct Script(pub(crate) u16); + +#[allow(missing_docs)] // These constants don't need documentation. +#[allow(non_upper_case_globals)] +impl Script { + pub const Adlam: Script = Script(167); + pub const Ahom: Script = Script(161); + pub const AnatolianHieroglyphs: Script = Script(156); + pub const Arabic: Script = Script(2); + pub const Armenian: Script = Script(3); + pub const Avestan: Script = Script(117); + pub const Balinese: Script = Script(62); + pub const Bamum: Script = Script(130); + pub const BassaVah: Script = Script(134); + pub const Batak: Script = Script(63); + pub const Bengali: Script = Script(4); + pub const Bhaiksuki: Script = Script(168); + pub const Bopomofo: Script = Script(5); + pub const Brahmi: Script = Script(65); + pub const Braille: Script = Script(46); + pub const Buginese: Script = Script(55); + pub const Buhid: Script = Script(44); + pub const CanadianAboriginal: Script = Script(40); + pub const Carian: Script = Script(104); + pub const CaucasianAlbanian: Script = Script(159); + pub const Chakma: Script = Script(118); + pub const Cham: Script = Script(66); + pub const Cherokee: Script = Script(6); + pub const Chorasmian: Script = Script(189); + pub const Common: Script = Script(0); + pub const Coptic: Script = Script(7); + pub const Cuneiform: Script = Script(101); + pub const Cypriot: Script = Script(47); + pub const CyproMinoan: Script = Script(193); + pub const Cyrillic: Script = Script(8); + pub const Deseret: Script = Script(9); + pub const Devanagari: Script = Script(10); + pub const DivesAkuru: Script = Script(190); + pub const Dogra: Script = Script(178); + pub const Duployan: Script = Script(135); + pub const EgyptianHieroglyphs: Script = Script(71); + pub const Elbasan: Script = Script(136); + pub const Elymaic: Script = Script(185); + pub const Ethiopic: Script = Script(11); + pub const Georgian: Script = Script(12); + pub const Glagolitic: Script = Script(56); + pub const Gothic: Script = Script(13); + pub const Grantha: Script = Script(137); + pub const Greek: Script = Script(14); + pub const Gujarati: Script = Script(15); + pub const GunjalaGondi: Script = Script(179); + pub const Gurmukhi: Script = Script(16); + pub const Han: Script = Script(17); + pub const Hangul: Script = Script(18); + pub const HanifiRohingya: Script = Script(182); + pub const Hanunoo: Script = Script(43); + pub const Hatran: Script = Script(162); + pub const Hebrew: Script = Script(19); + pub const Hiragana: Script = Script(20); + pub const ImperialAramaic: Script = Script(116); + pub const Inherited: Script = Script(1); + pub const InscriptionalPahlavi: Script = Script(122); + pub const InscriptionalParthian: Script = Script(125); + pub const Javanese: Script = Script(78); + pub const Kaithi: Script = Script(120); + pub const Kannada: Script = Script(21); + pub const Katakana: Script = Script(22); + pub const KayahLi: Script = Script(79); + pub const Kharoshthi: Script = Script(57); + pub const KhitanSmallScript: Script = Script(191); + pub const Khmer: Script = Script(23); + pub const Khojki: Script = Script(157); + pub const Khudawadi: Script = Script(145); + pub const Lao: Script = Script(24); + pub const Latin: Script = Script(25); + pub const Lepcha: Script = Script(82); + pub const Limbu: Script = Script(48); + pub const LinearA: Script = Script(83); + pub const LinearB: Script = Script(49); + pub const Lisu: Script = Script(131); + pub const Lycian: Script = Script(107); + pub const Lydian: Script = Script(108); + pub const Mahajani: Script = Script(160); + pub const Makasar: Script = Script(180); + pub const Malayalam: Script = Script(26); + pub const Mandaic: Script = Script(84); + pub const Manichaean: Script = Script(121); + pub const Marchen: Script = Script(169); + pub const MasaramGondi: Script = Script(175); + pub const Medefaidrin: Script = Script(181); + pub const MeeteiMayek: Script = Script(115); + pub const MendeKikakui: Script = Script(140); + pub const MeroiticCursive: Script = Script(141); + pub const MeroiticHieroglyphs: Script = Script(86); + pub const Miao: Script = Script(92); + pub const Modi: Script = Script(163); + pub const Mongolian: Script = Script(27); + pub const Mro: Script = Script(149); + pub const Multani: Script = Script(164); + pub const Myanmar: Script = Script(28); + pub const Nabataean: Script = Script(143); + pub const Nandinagari: Script = Script(187); + pub const NewTaiLue: Script = Script(59); + pub const Newa: Script = Script(170); + pub const Nko: Script = Script(87); + pub const Nushu: Script = Script(150); + pub const NyiakengPuachueHmong: Script = Script(186); + pub const Ogham: Script = Script(29); + pub const OlChiki: Script = Script(109); + pub const OldHungarian: Script = Script(76); + pub const OldItalic: Script = Script(30); + pub const OldNorthArabian: Script = Script(142); + pub const OldPermic: Script = Script(89); + pub const OldPersian: Script = Script(61); + pub const OldSogdian: Script = Script(184); + pub const OldSouthArabian: Script = Script(133); + pub const OldTurkic: Script = Script(88); + pub const OldUyghur: Script = Script(194); + pub const Oriya: Script = Script(31); + pub const Osage: Script = Script(171); + pub const Osmanya: Script = Script(50); + pub const PahawhHmong: Script = Script(75); + pub const Palmyrene: Script = Script(144); + pub const PauCinHau: Script = Script(165); + pub const PhagsPa: Script = Script(90); + pub const Phoenician: Script = Script(91); + pub const PsalterPahlavi: Script = Script(123); + pub const Rejang: Script = Script(110); + pub const Runic: Script = Script(32); + pub const Samaritan: Script = Script(126); + pub const Saurashtra: Script = Script(111); + pub const Sharada: Script = Script(151); + pub const Shavian: Script = Script(51); + pub const Siddham: Script = Script(166); + pub const SignWriting: Script = Script(112); + pub const Sinhala: Script = Script(33); + pub const Sogdian: Script = Script(183); + pub const SoraSompeng: Script = Script(152); + pub const Soyombo: Script = Script(176); + pub const Sundanese: Script = Script(113); + pub const SylotiNagri: Script = Script(58); + pub const Syriac: Script = Script(34); + pub const Tagalog: Script = Script(42); + pub const Tagbanwa: Script = Script(45); + pub const TaiLe: Script = Script(52); + pub const TaiTham: Script = Script(106); + pub const TaiViet: Script = Script(127); + pub const Takri: Script = Script(153); + pub const Tamil: Script = Script(35); + pub const Tangsa: Script = Script(195); + pub const Tangut: Script = Script(154); + pub const Telugu: Script = Script(36); + pub const Thaana: Script = Script(37); + pub const Thai: Script = Script(38); + pub const Tibetan: Script = Script(39); + pub const Tifinagh: Script = Script(60); + pub const Tirhuta: Script = Script(158); + pub const Toto: Script = Script(196); + pub const Ugaritic: Script = Script(53); + pub const Unknown: Script = Script(103); + pub const Vai: Script = Script(99); + pub const Vithkuqi: Script = Script(197); + pub const Wancho: Script = Script(188); + pub const WarangCiti: Script = Script(146); + pub const Yezidi: Script = Script(192); + pub const Yi: Script = Script(41); + pub const ZanabazarSquare: Script = Script(177); } diff --git a/components/uniset/src/lib.rs b/components/uniset/src/lib.rs index 30324d2dabd..f064c36cae4 100644 --- a/components/uniset/src/lib.rs +++ b/components/uniset/src/lib.rs @@ -65,6 +65,7 @@ pub mod enum_props; #[allow(missing_docs)] // TODO(#1030) - Add missing docs. pub mod props; pub mod provider; +mod ule; mod uniset; mod utils; @@ -85,6 +86,10 @@ pub enum UnicodeSetError { InvalidSet(Vec), #[displaydoc("Invalid range: {0}..{1}")] InvalidRange(u32, u32), + #[displaydoc("Unknown script id: {0}")] + UnknownScriptId(u16), + #[displaydoc("Unknown general category set: {0}")] + UnknownGeneralCategorySet(u32), #[displaydoc("{0}")] PropDataLoad(DataError), } diff --git a/components/uniset/src/props.rs b/components/uniset/src/props.rs index 8d803045007..1b490f7baec 100644 --- a/components/uniset/src/props.rs +++ b/components/uniset/src/props.rs @@ -542,6 +542,7 @@ where GeneralCategory::LineSeparator => key::GENERAL_CATEGORY_LINE_SEPARATOR_V1, GeneralCategory::ParagraphSeparator => key::GENERAL_CATEGORY_PARAGRAPH_SEPARATOR_V1, GeneralCategory::SpaceSeparator => key::GENERAL_CATEGORY_SPACE_SEPARATOR_V1, + _ => return Err(UnicodeSetError::UnknownGeneralCategorySet(enum_val.0)), }; get_prop(provider, key) } @@ -715,6 +716,7 @@ where Script::Yezidi => key::SCRIPT_YEZIDI_V1, Script::Yi => key::SCRIPT_YI_V1, Script::ZanabazarSquare => key::SCRIPT_ZANABAZAR_SQUARE_V1, + _ => return Err(UnicodeSetError::UnknownScriptId(enum_val.0)), }; get_prop(provider, key) } diff --git a/components/uniset/src/ule.rs b/components/uniset/src/ule.rs new file mode 100644 index 00000000000..e4c62f0739c --- /dev/null +++ b/components/uniset/src/ule.rs @@ -0,0 +1,60 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use crate::enum_props::{GeneralSubcategory, Script}; +use core::convert::TryFrom; +use num_enum::TryFromPrimitiveError; +use zerovec::ule::{AsULE, PlainOldULE, ULE}; + +#[repr(transparent)] +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub struct GeneralSubcategoryULE(u8); + +impl AsULE for GeneralSubcategory { + type ULE = GeneralSubcategoryULE; + + #[inline] + fn as_unaligned(&self) -> Self::ULE { + let u = *self as u8; + GeneralSubcategoryULE(u) + } + + #[inline] + fn from_unaligned(unaligned: &Self::ULE) -> Self { + // Safe because the contents of GeneralSubcategoryULE are required to be valid. + unsafe { Self::from_unchecked(unaligned.0) } + } +} + +// Safety (based on the safety checklist on the ULE trait): +// 1. GeneralSubcategory does not include any uninitialized or padding bytes. +// 2. The impl of validate_byte_slice() returns an error if any byte is not valid. +// Because GeneralSubcategory is repr(u8), any length of byte slice is okay. +// 3. The other ULE methods use the default impl. +// 4. The PartialEq implementation on GeneralSubcategory uses byte equality. +unsafe impl ULE for GeneralSubcategoryULE { + type Error = TryFromPrimitiveError; + + fn validate_byte_slice(bytes: &[u8]) -> Result<(), Self::Error> { + // Validate the bytes + for b in bytes { + GeneralSubcategory::try_from(*b)?; + } + Ok(()) + } +} + +impl AsULE for Script { + type ULE = PlainOldULE<2>; + + #[inline] + fn as_unaligned(&self) -> Self::ULE { + PlainOldULE(self.0.to_le_bytes()) + } + + #[inline] + fn from_unaligned(unaligned: &Self::ULE) -> Self { + Script(u16::from_le_bytes(unaligned.0)) + } +}