diff --git a/unic/ucd/Cargo.toml b/unic/ucd/Cargo.toml index 36d6c699..bd38cc48 100644 --- a/unic/ucd/Cargo.toml +++ b/unic/ucd/Cargo.toml @@ -20,10 +20,11 @@ unic-ucd-age = { path = "age/", version = "0.6.0" } unic-ucd-bidi = { path = "bidi/", version = "0.6.0" } unic-ucd-case = { path = "case/", version = "0.6.0" } unic-ucd-category = { path = "category/", version = "0.6.0" } -unic-ucd-ident = { path = "ident/", version = "0.6.0" } unic-ucd-core = { path = "core/", version = "0.6.0" } +unic-ucd-ident = { path = "ident/", version = "0.6.0" } unic-ucd-name = { path = "name/", version = "0.6.0" } unic-ucd-normal = { path = "normal/", version = "0.6.0", features = ["unic-ucd-category"] } +unic-ucd-segment = { path = "segment/", version = "0.6.0" } [dev-dependencies] unic-utils = { path = "../utils/", version = "0.6.0" } diff --git a/unic/ucd/segment/Cargo.toml b/unic/ucd/segment/Cargo.toml new file mode 100644 index 00000000..d605a55b --- /dev/null +++ b/unic/ucd/segment/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "unic-ucd-segment" +version = "0.6.0" +authors = ["The UNIC Project Developers"] +repository = "https://github.com/behnam/rust-unic/" +license = "MIT/Apache-2.0" +description = "UNIC - Unicode Character Database - Segmentation Properties" +keywords = ["text", "unicode", "character-property", "segmentation", "grapheme"] +categories = ["internationalization", "text-processing", "parsing", "rendering"] + +# No tests/benches that depend on /data/ +exclude = [] + +[badges] +travis-ci = { repository = "behnam/rust-unic", branch = "master" } + +[dependencies] +unic-char-property = { path = "../../char/property/", version = "0.6.0" } +unic-char-range = { path = "../../char/range", version = "0.6.0" } +unic-ucd-core = { path = "../core/", version = "0.6.0" } +unic-utils = { path = "../../utils/", version = "0.6.0" } diff --git a/unic/ucd/segment/src/grapheme_cluster_break.rs b/unic/ucd/segment/src/grapheme_cluster_break.rs new file mode 100644 index
00000000..eb279dfa --- /dev/null +++ b/unic/ucd/segment/src/grapheme_cluster_break.rs @@ -0,0 +1,469 @@ +// Copyright 2017 The UNIC Project Developers. +// +// See the COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + + +//! Unicode *Grapheme_Cluster_Break* Character Property. +//! +//! ## References +//! +//! * +//! * +//! * + + +use unic_char_property::TotalCharProperty; + + +char_property! { + /// Represents the Unicode character + /// [*Grapheme_Cluster_Break*](http://www.unicode.org/reports/tr44/#Grapheme_Cluster_Break) + /// property. + /// + /// ## See Also + /// + /// * + /// * + pub enum GraphemeClusterBreak { + abbr => "GCB"; + long => "Grapheme_Cluster_Break"; + human => "Grapheme Cluster Break"; + + /// ```text + /// U+000D CARRIAGE RETURN (CR) + /// ``` + CR { + abbr => CR, + long => CR, + human => "Carriage Return", + } + + /// ```text + /// U+000A LINE FEED (LF) + /// ``` + LF { + abbr => LF, + long => LF, + human => "Line Feed", + } + + /// ```text + /// General_Category = Line_Separator, or + /// General_Category = Paragraph_Separator, or + /// General_Category = Control, or + /// General_Category = Unassigned and Default_Ignorable_Code_Point, or + /// General_Category = Surrogate, or + /// General_Category = Format + /// and not U+000D CARRIAGE RETURN + /// and not U+000A LINE FEED + /// and not U+200C ZERO WIDTH NON-JOINER (ZWNJ) + /// and not U+200D ZERO WIDTH JOINER (ZWJ) + /// ``` + Control { + abbr => CN, + long => Control, + human => "Control", + } + + /// ```text + /// Grapheme_Extend = Yes + /// + /// This includes: + /// General_Category = Nonspacing_Mark + /// General_Category = Enclosing_Mark + /// U+200C ZERO WIDTH NON-JOINER + /// plus a few General_Category = Spacing_Mark needed for canonical equivalence. 
+ /// ``` + Extend { + abbr => EX, + long => Extend, + human => "Extend", + } + + /// ```text + /// U+200D ZERO WIDTH JOINER + /// ``` + ZWJ { + abbr => ZWJ, + long => ZWJ, + human => "Zero Width Joiner (ZWJ)", + } + + /// ```text + /// Regional_Indicator = Yes + /// ``` + /// + /// This consists of the range: + /// ```text + /// U+1F1E6 REGIONAL INDICATOR SYMBOL LETTER A + /// ..U+1F1FF REGIONAL INDICATOR SYMBOL LETTER Z + /// ``` + RegionalIndicator { + abbr => RI, + long => Regional_Indicator, + human => "Regional Indicator", + } + + /// ```text + /// Indic_Syllabic_Category = Consonant_Preceding_Repha, or + /// Indic_Syllabic_Category = Consonant_Prefixed, or + /// Prepended_Concatenation_Mark = Yes + /// ``` + Prepend { + abbr => PP, + long => Prepend, + human => "Prepend", + } + + /// ```text + /// Grapheme_Cluster_Break ≠ Extend, and + /// General_Category = Spacing_Mark, or + /// any of the following (which have General_Category = Other_Letter): + /// U+0E33 ( ำ ) THAI CHARACTER SARA AM + /// U+0EB3 ( ຳ ) LAO VOWEL SIGN AM + /// ``` + /// + /// Exceptions: The following (which have General_Category = Spacing_Mark and would + /// otherwise be included) are specifically excluded: + /// + /// ```text + /// U+102B ( ါ ) MYANMAR VOWEL SIGN TALL AA + /// U+102C ( ာ ) MYANMAR VOWEL SIGN AA + /// U+1038 ( း ) MYANMAR SIGN VISARGA + /// U+1062 ( ၢ ) MYANMAR VOWEL SIGN SGAW KAREN EU + /// ..U+1064 ( ၤ ) MYANMAR TONE MARK SGAW KAREN KE PHO + /// U+1067 ( ၧ ) MYANMAR VOWEL SIGN WESTERN PWO KAREN EU + /// ..U+106D ( ၭ ) MYANMAR SIGN WESTERN PWO KAREN TONE-5 + /// U+1083 ( ႃ ) MYANMAR VOWEL SIGN SHAN AA + /// U+1087 ( ႇ ) MYANMAR SIGN SHAN TONE-2 + /// ..U+108C ( ႌ ) MYANMAR SIGN SHAN COUNCIL TONE-3 + /// U+108F ( ႏ ) MYANMAR SIGN RUMAI PALAUNG TONE-5 + /// U+109A ( ႚ ) MYANMAR SIGN KHAMTI TONE-1 + /// ..U+109C ( ႜ ) MYANMAR VOWEL SIGN AITON A + /// U+1A61 ( ᩡ ) TAI THAM VOWEL SIGN A + /// U+1A63 ( ᩣ ) TAI THAM VOWEL SIGN AA + /// U+1A64 ( ᩤ ) TAI THAM VOWEL SIGN TALL AA 
+ /// U+AA7B ( ꩻ ) MYANMAR SIGN PAO KAREN TONE + /// U+AA7D ( ꩽ ) MYANMAR SIGN TAI LAING TONE-5 + /// U+11720 ( 𑜠 ) AHOM VOWEL SIGN A + /// U+11721 ( 𑜡 ) AHOM VOWEL SIGN AA + /// ``` + SpacingMark { + abbr => SM, + long => SpacingMark, + human => "Spacing Mark", + } + + // Hangul + + /// ```text + /// Hangul_Syllable_Type=L + /// ``` + /// + /// Such as: + /// + /// ```text + /// U+1100 ( ᄀ ) HANGUL CHOSEONG KIYEOK + /// U+115F ( ᅟ ) HANGUL CHOSEONG FILLER + /// U+A960 ( ꥠ ) HANGUL CHOSEONG TIKEUT-MIEUM + /// U+A97C ( ꥼ ) HANGUL CHOSEONG SSANGYEORINHIEUH + /// ``` + L { + abbr => L, + long => L, + human => "Hangul Syllable Type L", + } + + /// ```text + /// Hangul_Syllable_Type=V + /// ``` + /// + /// Such as: + /// + /// ```text + /// U+1160 ( ᅠ ) HANGUL JUNGSEONG FILLER + /// U+11A2 ( ᆢ ) HANGUL JUNGSEONG SSANGARAEA + /// U+D7B0 ( ힰ ) HANGUL JUNGSEONG O-YEO + /// U+D7C6 ( ퟆ ) HANGUL JUNGSEONG ARAEA-E + /// ``` + V { + abbr => V, + long => V, + human => "Hangul Syllable Type V", + } + + /// ```text + /// Hangul_Syllable_Type=T + /// ``` + /// + /// Such as: + /// + /// ```text + /// U+11A8 ( ᆨ ) HANGUL JONGSEONG KIYEOK + /// U+11F9 ( ᇹ ) HANGUL JONGSEONG YEORINHIEUH + /// U+D7CB ( ퟋ ) HANGUL JONGSEONG NIEUN-RIEUL + /// U+D7FB ( ퟻ ) HANGUL JONGSEONG PHIEUPH-THIEUTH + /// ``` + T { + abbr => T, + long => T, + human => "Hangul Syllable Type T", + } + + /// ```text + /// Hangul_Syllable_Type=LV: + /// ``` + /// + /// That is: + /// + /// ```text + /// U+AC00 ( 가 ) HANGUL SYLLABLE GA + /// U+AC1C ( 개 ) HANGUL SYLLABLE GAE + /// U+AC38 ( 갸 ) HANGUL SYLLABLE GYA + /// ... + /// ``` + LV { + abbr => LV, + long => LV, + human => "Hangul Syllable Type LV", + } + + /// ```text + /// Hangul_Syllable_Type=LVT + /// ``` + /// + /// That is: + /// + /// ```text + /// U+AC01 ( 각 ) HANGUL SYLLABLE GAG + /// U+AC02 ( 갂 ) HANGUL SYLLABLE GAGG + /// U+AC03 ( 갃 ) HANGUL SYLLABLE GAGS + /// U+AC04 ( 간 ) HANGUL SYLLABLE GAN + /// ... 
+ /// ``` + LVT { + abbr => LVT, + long => LVT, + human => "Hangul Syllable Type LVT", + } + + // Emoji + + /// Emoji characters listed as `Emoji_Modifier_Base=Yes` in `emoji-data.txt`, which do not + /// occur after ZWJ in `emoji-zwj-sequences.txt`. + /// + /// See . + EBase { + abbr => EB, + long => E_Base, + human => "Emoji Base", + } + + /// Emoji characters listed as `Emoji_Modifier=Yes` in `emoji-data.txt`. + /// + /// See . + EModifier { + abbr => EM, + long => E_Modifier, + human => "Emoji Modifier", + } + + /// Emoji characters that do not break from a previous ZWJ in a defined emoji ZWJ sequence, + /// and are not listed as `Emoji_Modifier_Base=Yes` in `emoji-data.txt`. + /// + /// See . + GlueAfterZwj { + abbr => GAZ, + long => Glue_After_Zwj, + human => "Glue After ZWJ", + } + + /// Emoji characters listed as `Emoji_Modifier_Base=Yes` in `emoji-data.txt`, and also occur + /// after ZWJ in `emoji-zwj-sequences.txt`. + /// + /// See . + EBaseGAZ { + abbr => EBG, + long => E_Base_GAZ, + human => "Emoji Base and Glue After ZWJ", + } + + /// Everything else + // TODO: Replace with `Option::None` + Other { + abbr => XX, + long => Other, + human => "Other", + } + + } + + /// Abbreviated name aliases for the + /// [*Grapheme_Cluster_Break*](http://www.unicode.org/reports/tr44/#Grapheme_Cluster_Break) + /// property. + /// + /// ## See Also + /// + /// * + pub mod abbr_names for abbr; + + /// Long name aliases for the + /// [*Grapheme_Cluster_Break*](http://www.unicode.org/reports/tr44/#Grapheme_Cluster_Break) + /// property.
+ /// + /// ## See Also + /// + /// * + pub mod long_names for long; +} + + +impl TotalCharProperty for GraphemeClusterBreak { + fn of(ch: char) -> Self { + Self::of(ch) + } +} + + +/// Ref: +/// +/// ```text +/// @missing: 0000..10FFFF; Other +/// ``` +impl Default for GraphemeClusterBreak { + #[inline] + fn default() -> Self { + GraphemeClusterBreak::Other + } +} + + +mod data { + use super::long_names as GCB; + use unic_utils::CharDataTable; + pub const GRAPHEME_CLUSTER_BREAK_TABLE: CharDataTable = + include!("../tables/grapheme_cluster_break.rsv"); +} + + +impl GraphemeClusterBreak { + /// Find the character *Grapheme_Cluster_Break* property value. + pub fn of(ch: char) -> GraphemeClusterBreak { + data::GRAPHEME_CLUSTER_BREAK_TABLE.find_or_default(ch) + } +} + + + +#[cfg(test)] +mod tests { + use unic_char_property::EnumeratedCharProperty; + use super::GraphemeClusterBreak as GCB; + + #[test] + fn test_ascii() { + assert_eq!(GCB::of('\u{0000}'), GCB::Control); + assert_eq!(GCB::of('\u{0040}'), GCB::Other); + assert_eq!(GCB::of('\u{0041}'), GCB::Other); + assert_eq!(GCB::of('\u{0062}'), GCB::Other); + assert_eq!(GCB::of('\u{007F}'), GCB::Control); + } + + #[test] + fn test_bmp() { + // Hebrew + assert_eq!(GCB::of('\u{0590}'), GCB::Other); + assert_eq!(GCB::of('\u{05D0}'), GCB::Other); + assert_eq!(GCB::of('\u{05D1}'), GCB::Other); + assert_eq!(GCB::of('\u{05FF}'), GCB::Other); + + // Arabic + assert_eq!(GCB::of('\u{0600}'), GCB::Prepend); + assert_eq!(GCB::of('\u{0627}'), GCB::Other); + assert_eq!(GCB::of('\u{07BF}'), GCB::Other); + + // Default R + Arabic Extras + assert_eq!(GCB::of('\u{07C0}'), GCB::Other); + assert_eq!(GCB::of('\u{085F}'), GCB::Other); + assert_eq!(GCB::of('\u{0860}'), GCB::Other); + assert_eq!(GCB::of('\u{0870}'), GCB::Other); + assert_eq!(GCB::of('\u{089F}'), GCB::Other); + assert_eq!(GCB::of('\u{08A0}'), GCB::Other); + assert_eq!(GCB::of('\u{089F}'), GCB::Other); + assert_eq!(GCB::of('\u{08FF}'), GCB::Extend); + + // Default ET + 
assert_eq!(GCB::of('\u{20A0}'), GCB::Other); + assert_eq!(GCB::of('\u{20CF}'), GCB::Other); + + // Arabic Presentation Forms + assert_eq!(GCB::of('\u{FB1D}'), GCB::Other); + assert_eq!(GCB::of('\u{FB4F}'), GCB::Other); + assert_eq!(GCB::of('\u{FB50}'), GCB::Other); + assert_eq!(GCB::of('\u{FDCF}'), GCB::Other); + assert_eq!(GCB::of('\u{FDF0}'), GCB::Other); + assert_eq!(GCB::of('\u{FDFF}'), GCB::Other); + assert_eq!(GCB::of('\u{FE70}'), GCB::Other); + assert_eq!(GCB::of('\u{FEFE}'), GCB::Other); + assert_eq!(GCB::of('\u{FEFF}'), GCB::Control); + + // noncharacters + assert_eq!(GCB::of('\u{FDD0}'), GCB::Other); + assert_eq!(GCB::of('\u{FDD1}'), GCB::Other); + assert_eq!(GCB::of('\u{FDEE}'), GCB::Other); + assert_eq!(GCB::of('\u{FDEF}'), GCB::Other); + assert_eq!(GCB::of('\u{FFFE}'), GCB::Other); + assert_eq!(GCB::of('\u{FFFF}'), GCB::Other); + } + + #[test] + fn test_smp() { + // Default AL + R + assert_eq!(GCB::of('\u{10800}'), GCB::Other); + assert_eq!(GCB::of('\u{10FFF}'), GCB::Other); + assert_eq!(GCB::of('\u{1E800}'), GCB::Other); + assert_eq!(GCB::of('\u{1EDFF}'), GCB::Other); + assert_eq!(GCB::of('\u{1EE00}'), GCB::Other); + assert_eq!(GCB::of('\u{1EEFF}'), GCB::Other); + assert_eq!(GCB::of('\u{1EF00}'), GCB::Other); + assert_eq!(GCB::of('\u{1EFFF}'), GCB::Other); + } + + #[test] + fn test_unassigned_planes() { + assert_eq!(GCB::of('\u{30000}'), GCB::Other); + assert_eq!(GCB::of('\u{40000}'), GCB::Other); + assert_eq!(GCB::of('\u{50000}'), GCB::Other); + assert_eq!(GCB::of('\u{60000}'), GCB::Other); + assert_eq!(GCB::of('\u{70000}'), GCB::Other); + assert_eq!(GCB::of('\u{80000}'), GCB::Other); + assert_eq!(GCB::of('\u{90000}'), GCB::Other); + assert_eq!(GCB::of('\u{a0000}'), GCB::Other); + } + + #[test] + fn test_abbr_name() { + assert_eq!(GCB::Other.abbr_name(), "XX"); + } + + #[test] + fn test_long_name() { + assert_eq!(GCB::Other.long_name(), "Other"); + } + + #[test] + fn test_human_name() { + assert_eq!(GCB::Other.human_name(), "Other"); + } + + #[test] 
+ fn test_display() { + assert_eq!(format!("{}", GCB::Other), "Other"); + } +} diff --git a/unic/ucd/segment/src/lib.rs b/unic/ucd/segment/src/lib.rs new file mode 100644 index 00000000..cc020eb5 --- /dev/null +++ b/unic/ucd/segment/src/lib.rs @@ -0,0 +1,45 @@ +// Copyright 2017 The UNIC Project Developers. +// +// See the COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + + +#![forbid(unsafe_code, unconditional_recursion)] +#![deny(missing_docs)] + +//! # UNIC — UCD — Segmentation Properties +//! +//! A component of [`unic`: Unicode and Internationalization Crates for Rust](/unic/). +//! +//! Accessor for Text Segmentation character properties from Unicode Character Database (UCD). + +#[macro_use] +extern crate unic_char_property; + +#[macro_use] +extern crate unic_char_range; + +extern crate unic_ucd_core; +extern crate unic_utils; + + +pub mod grapheme_cluster_break; +pub mod sentence_break; +pub mod word_break; + + +pub use grapheme_cluster_break::GraphemeClusterBreak; +pub use sentence_break::SentenceBreak; +pub use word_break::WordBreak; + + +use unic_ucd_core::UnicodeVersion; + + +/// The [Unicode version](http://www.unicode.org/versions/) of data +pub const UNICODE_VERSION: UnicodeVersion = include!("../tables/unicode_version.rsv"); diff --git a/unic/ucd/segment/src/sentence_break.rs b/unic/ucd/segment/src/sentence_break.rs new file mode 100644 index 00000000..3cb325c6 --- /dev/null +++ b/unic/ucd/segment/src/sentence_break.rs @@ -0,0 +1,385 @@ +// Copyright 2017 The UNIC Project Developers. +// +// See the COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option.
This file may not be copied, modified, or distributed +// except according to those terms. + + +//! Unicode *Sentence_Break* Character Property. +//! +//! ## References +//! +//! * +//! * +//! * + + +use unic_char_property::TotalCharProperty; + + +char_property! { + /// Represents the Unicode character + /// [*Sentence_Break*](http://www.unicode.org/reports/tr44/#Sentence_Break) + /// property. + /// + /// ## See Also + /// + /// * + /// * + pub enum SentenceBreak { + abbr => "SB"; + long => "Sentence_Break"; + human => "Sentence Break"; + + /// ```text + /// U+000D CARRIAGE RETURN (CR) + /// ``` + CR { + abbr => CR, + long => CR, + human => "Carriage Return", + } + + /// ```text + /// U+000A LINE FEED (LF) + /// ``` + LF { + abbr => LF, + long => LF, + human => "Line Feed", + } + + /// ```text + /// Grapheme_Extend = Yes, or + /// U+200D ZERO WIDTH JOINER (ZWJ), or + /// General_Category = Spacing_Mark + /// ``` + Extend { + abbr => EX, + long => Extend, + human => "Extend", + } + + /// ```text + /// U+0085 NEXT LINE (NEL) + /// U+2028 LINE SEPARATOR + /// U+2029 PARAGRAPH SEPARATOR + /// ``` + Sep { + abbr => SE, + long => Sep, + human => "Separator", + } + + /// ```text + /// General_Category = Format + /// and not U+200C ZERO WIDTH NON-JOINER (ZWNJ) + /// and not U+200D ZERO WIDTH JOINER (ZWJ) + /// ``` + Format { + abbr => FO, + long => Format, + human => "Format", + } + + /// ```text + /// White_Space = Yes + /// and Sentence_Break ≠ Sep + /// and Sentence_Break ≠ CR + /// and Sentence_Break ≠ LF + /// ``` + Sp { + abbr => SP, + long => Sp, + human => "Space", + } + + /// ```text + /// Lowercase = Yes + /// and Grapheme_Extend = No + /// ``` + Lower { + abbr => LO, + long => Lower, + human => "Lowercase", + } + + /// ```text + /// General_Category = Titlecase_Letter, or + /// Uppercase = Yes + /// ``` + Upper { + abbr => UP, + long => Upper, + human => "Uppercase", + } + + /// ```text + /// Alphabetic = Yes, or + /// U+00A0 NO-BREAK SPACE (NBSP), or + ///
U+05F3 ( ׳ ) HEBREW PUNCTUATION GERESH + /// and Lower = No + /// and Upper = No + /// and Sentence_Break ≠ Extend + /// ``` + OLetter { + abbr => LE, + long => OLetter, + human => "Other Letter", + } + + /// ```text + /// Line_Break = Numeric + /// ``` + Numeric { + abbr => NU, + long => Numeric, + human => "Numeric", + } + + /// ```text + /// U+002E ( . ) FULL STOP + /// U+2024 ( ․ ) ONE DOT LEADER + /// U+FE52 ( ﹒ ) SMALL FULL STOP + /// U+FF0E ( . ) FULLWIDTH FULL STOP + /// ``` + ATerm { + abbr => AT, + long => ATerm, + human => "ATerm", + } + + /// ```text + /// U+002C ( , ) COMMA + /// U+002D ( - ) HYPHEN-MINUS + /// U+003A ( : ) COLON + /// U+055D ( ՝ ) ARMENIAN COMMA + /// U+060C ( ، ) ARABIC COMMA + /// U+060D ( ‎؍‎ ) ARABIC DATE SEPARATOR + /// U+07F8 ( ߸ ) NKO COMMA + /// U+1802 ( ᠂ ) MONGOLIAN COMMA + /// U+1808 ( ᠈ ) MONGOLIAN MANCHU COMMA + /// U+2013 ( – ) EN DASH + /// U+2014 ( — ) EM DASH + /// U+3001 ( 、 ) IDEOGRAPHIC COMMA + /// U+FE10 ( ︐ ) PRESENTATION FORM FOR VERTICAL COMMA + /// U+FE11 ( ︑ ) PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA + /// U+FE13 ( ︓ ) PRESENTATION FORM FOR VERTICAL COLON + /// U+FE31 ( ︱ ) PRESENTATION FORM FOR VERTICAL EM DASH + /// U+FE32 ( ︲ ) PRESENTATION FORM FOR VERTICAL EN DASH + /// U+FE50 ( ﹐ ) SMALL COMMA + /// U+FE51 ( ﹑ ) SMALL IDEOGRAPHIC COMMA + /// U+FE55 ( ﹕ ) SMALL COLON + /// U+FE58 ( ﹘ ) SMALL EM DASH + /// U+FE63 ( ﹣ ) SMALL HYPHEN-MINUS + /// U+FF0C ( , ) FULLWIDTH COMMA + /// U+FF0D ( - ) FULLWIDTH HYPHEN-MINUS + /// U+FF1A ( : ) FULLWIDTH COLON + /// U+FF64 ( 、 ) HALFWIDTH IDEOGRAPHIC COMMA + /// ``` + SContinue { + abbr => SC, + long => SContinue, + human => "Sentence Continue", + } + + /// ```text + /// Sentence_Terminal = Yes + /// ``` + STerm { + abbr => ST, + long => STerm, + human => "Sentence Terminal", + } + + /// ```text + /// General_Category = Open_Punctuation, or + /// General_Category = Close_Punctuation, or + /// Line_Break = Quotation + /// and not U+05F3 ( ׳ ) HEBREW PUNCTUATION 
GERESH + /// and ATerm = No + /// and STerm = No + /// ``` + Close { + abbr => CL, + long => Close, + human => "Close", + } + + /// Everything else + // TODO: Replace with `Option::None` + Other { + abbr => XX, + long => Other, + human => "Other", + } + + } + + /// Abbreviated name aliases for the + /// [*Sentence_Break*](http://www.unicode.org/reports/tr44/#Sentence_Break) + /// property. + /// + /// ## See Also + /// + /// * + pub mod abbr_names for abbr; + + /// Long name aliases for the + /// [*Sentence_Break*](http://www.unicode.org/reports/tr44/#Sentence_Break) + /// property. + /// + /// ## See Also + /// + /// * + pub mod long_names for long; +} + + +impl TotalCharProperty for SentenceBreak { + fn of(ch: char) -> Self { + Self::of(ch) + } +} + + +/// Ref: +/// +/// ```text +/// @missing: 0000..10FFFF; Other +/// ``` +impl Default for SentenceBreak { + #[inline] + fn default() -> Self { + SentenceBreak::Other + } +} + + +mod data { + use super::long_names as SB; + use unic_utils::CharDataTable; + pub const SENTENCE_BREAK_TABLE: CharDataTable = + include!("../tables/sentence_break.rsv"); +} + + +impl SentenceBreak { + /// Find the character *Sentence_Break* property value. 
+ pub fn of(ch: char) -> SentenceBreak { + data::SENTENCE_BREAK_TABLE.find_or_default(ch) + } +} + + + +#[cfg(test)] +mod tests { + use unic_char_property::EnumeratedCharProperty; + use super::SentenceBreak as SB; + + #[test] + fn test_ascii() { + assert_eq!(SB::of('\u{0000}'), SB::Other); + assert_eq!(SB::of('\u{0040}'), SB::Other); + assert_eq!(SB::of('\u{0041}'), SB::Upper); + assert_eq!(SB::of('\u{0062}'), SB::Lower); + assert_eq!(SB::of('\u{007F}'), SB::Other); + } + + #[test] + fn test_bmp() { + // Hebrew + assert_eq!(SB::of('\u{0590}'), SB::Other); + assert_eq!(SB::of('\u{05D0}'), SB::OLetter); + assert_eq!(SB::of('\u{05D1}'), SB::OLetter); + assert_eq!(SB::of('\u{05FF}'), SB::Other); + + // Arabic + assert_eq!(SB::of('\u{0600}'), SB::Format); + assert_eq!(SB::of('\u{0627}'), SB::OLetter); + assert_eq!(SB::of('\u{07BF}'), SB::Other); + + // Default R + Arabic Extras + assert_eq!(SB::of('\u{07C0}'), SB::Numeric); + assert_eq!(SB::of('\u{085F}'), SB::Other); + assert_eq!(SB::of('\u{0860}'), SB::OLetter); + assert_eq!(SB::of('\u{0870}'), SB::Other); + assert_eq!(SB::of('\u{089F}'), SB::Other); + assert_eq!(SB::of('\u{08A0}'), SB::OLetter); + assert_eq!(SB::of('\u{089F}'), SB::Other); + assert_eq!(SB::of('\u{08FF}'), SB::Extend); + + // Default ET + assert_eq!(SB::of('\u{20A0}'), SB::Other); + assert_eq!(SB::of('\u{20CF}'), SB::Other); + + // Arabic Presentation Forms + assert_eq!(SB::of('\u{FB1D}'), SB::OLetter); + assert_eq!(SB::of('\u{FB4F}'), SB::OLetter); + assert_eq!(SB::of('\u{FB50}'), SB::OLetter); + assert_eq!(SB::of('\u{FDCF}'), SB::Other); + assert_eq!(SB::of('\u{FDF0}'), SB::OLetter); + assert_eq!(SB::of('\u{FDFF}'), SB::Other); + assert_eq!(SB::of('\u{FE70}'), SB::OLetter); + assert_eq!(SB::of('\u{FEFE}'), SB::Other); + assert_eq!(SB::of('\u{FEFF}'), SB::Format); + + // noncharacters + assert_eq!(SB::of('\u{FDD0}'), SB::Other); + assert_eq!(SB::of('\u{FDD1}'), SB::Other); + assert_eq!(SB::of('\u{FDEE}'), SB::Other); + assert_eq!(SB::of('\u{FDEF}'), 
SB::Other); + assert_eq!(SB::of('\u{FFFE}'), SB::Other); + assert_eq!(SB::of('\u{FFFF}'), SB::Other); + } + + #[test] + fn test_smp() { + // Default AL + R + assert_eq!(SB::of('\u{10800}'), SB::OLetter); + assert_eq!(SB::of('\u{10FFF}'), SB::Other); + assert_eq!(SB::of('\u{1E800}'), SB::OLetter); + assert_eq!(SB::of('\u{1EDFF}'), SB::Other); + assert_eq!(SB::of('\u{1EE00}'), SB::OLetter); + assert_eq!(SB::of('\u{1EEFF}'), SB::Other); + assert_eq!(SB::of('\u{1EF00}'), SB::Other); + assert_eq!(SB::of('\u{1EFFF}'), SB::Other); + } + + #[test] + fn test_unassigned_planes() { + assert_eq!(SB::of('\u{30000}'), SB::Other); + assert_eq!(SB::of('\u{40000}'), SB::Other); + assert_eq!(SB::of('\u{50000}'), SB::Other); + assert_eq!(SB::of('\u{60000}'), SB::Other); + assert_eq!(SB::of('\u{70000}'), SB::Other); + assert_eq!(SB::of('\u{80000}'), SB::Other); + assert_eq!(SB::of('\u{90000}'), SB::Other); + assert_eq!(SB::of('\u{a0000}'), SB::Other); + } + + #[test] + fn test_abbr_name() { + assert_eq!(SB::Other.abbr_name(), "XX"); + } + + #[test] + fn test_long_name() { + assert_eq!(SB::Other.long_name(), "Other"); + } + + #[test] + fn test_human_name() { + assert_eq!(SB::Other.human_name(), "Other"); + } + + #[test] + fn test_display() { + assert_eq!(format!("{}", SB::Other), "Other"); + } +} diff --git a/unic/ucd/segment/src/word_break.rs b/unic/ucd/segment/src/word_break.rs new file mode 100644 index 00000000..add096e2 --- /dev/null +++ b/unic/ucd/segment/src/word_break.rs @@ -0,0 +1,477 @@ +// Copyright 2017 The UNIC Project Developers. +// +// See the COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + + +//! Unicode *Word_Break* Character Property. +//! +//! ## References +//! +//! * +//! * +//! * + + +use unic_char_property::TotalCharProperty; + + +char_property! 
{ + /// Represents the Unicode character + /// [*Word_Break*](http://www.unicode.org/reports/tr44/#Word_Break) + /// property. + /// + /// ## See Also + /// + /// * + /// * + pub enum WordBreak { + abbr => "WB"; + long => "Word_Break"; + human => "Word Break"; + + /// ```text + /// U+000D CARRIAGE RETURN (CR) + /// ``` + CR { + abbr => CR, + long => CR, + human => "Carriage Return", + } + + /// ```text + /// U+000A LINE FEED (LF) + /// ``` + LF { + abbr => LF, + long => LF, + human => "Line Feed", + } + + /// ```text + /// U+000B LINE TABULATION + /// U+000C FORM FEED (FF) + /// U+0085 NEXT LINE (NEL) + /// U+2028 LINE SEPARATOR + /// U+2029 PARAGRAPH SEPARATOR + /// ``` + Newline { + abbr => NL, + long => Newline, + human => "Newline", + } + + /// ```text + /// Grapheme_Extend = Yes, or + /// General_Category = Spacing_Mark + /// and not U+200D ZERO WIDTH JOINER (ZWJ) + /// ``` + Extend { + abbr => Extend, + long => Extend, + human => "Extend", + } + + /// ```text + /// U+200D ZERO WIDTH JOINER + /// ``` + ZWJ { + abbr => ZWJ, + long => ZWJ, + human => "Zero Width Joiner (ZWJ)", + } + + /// ```text + /// Regional_Indicator = Yes + /// ``` + /// + /// This consists of the range: + /// ```text + /// U+1F1E6 REGIONAL INDICATOR SYMBOL LETTER A + /// ..U+1F1FF REGIONAL INDICATOR SYMBOL LETTER Z + /// ``` + RegionalIndicator { + abbr => RI, + long => Regional_Indicator, + human => "Regional Indicator", + } + + /// ```text + /// General_Category = Format + /// and not U+200B ZERO WIDTH SPACE (ZWSP) + /// and not U+200C ZERO WIDTH NON-JOINER (ZWNJ) + /// and not U+200D ZERO WIDTH JOINER (ZWJ) + /// ``` + Format { + abbr => FO, + long => Format, + human => "Format", + } + + /// ```text + /// Script = KATAKANA, or + /// any of the following: + /// U+3031 ( 〱 ) VERTICAL KANA REPEAT MARK + /// U+3032 ( 〲 ) VERTICAL KANA REPEAT WITH VOICED SOUND MARK + /// U+3033 ( 〳 ) VERTICAL KANA REPEAT MARK UPPER HALF + /// U+3034 ( 〴 ) VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER 
HALF + /// U+3035 ( 〵 ) VERTICAL KANA REPEAT MARK LOWER HALF + /// U+309B ( ゛ ) KATAKANA-HIRAGANA VOICED SOUND MARK + /// U+309C ( ゜ ) KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK + /// U+30A0 ( ゠ ) KATAKANA-HIRAGANA DOUBLE HYPHEN + /// U+30FC ( ー ) KATAKANA-HIRAGANA PROLONGED SOUND MARK + /// U+FF70 ( ー ) HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK + /// ``` + Katakana { + abbr => KA, + long => Katakana, + human => "Katakana", + } + + /// ```text + /// Script = Hebrew + /// and General_Category = Other_Letter + /// ``` + HebrewLetter { + abbr => HL, + long => Hebrew_Letter, + human => "Hebrew Letter", + } + + /// ```text + /// Alphabetic = Yes, or + /// any of the following 36 characters: + /// U+02C2 ( ˂ ) MODIFIER LETTER LEFT ARROWHEAD + /// ..U+02C5 ( ˅ ) MODIFIER LETTER DOWN ARROWHEAD + /// U+02D2 ( ˒ ) MODIFIER LETTER CENTRED RIGHT HALF RING + /// ..U+02D7 ( ˗ ) MODIFIER LETTER MINUS SIGN + /// U+02DE ( ˞ ) MODIFIER LETTER RHOTIC HOOK + /// U+02DF ( ˟ ) MODIFIER LETTER CROSS ACCENT + /// U+02ED ( ˭ ) MODIFIER LETTER UNASPIRATED + /// U+02EF ( ˯ ) MODIFIER LETTER LOW DOWN ARROWHEAD + /// ..U+02FF ( ˿ ) MODIFIER LETTER LOW LEFT ARROW + /// U+05F3 ( ׳ ) HEBREW PUNCTUATION GERESH + /// U+A720 ( ꜠ ) MODIFIER LETTER STRESS AND HIGH TONE + /// U+A721 ( ꜡ ) MODIFIER LETTER STRESS AND LOW TONE + /// U+A789 ( ꞉ ) MODIFIER LETTER COLON + /// U+A78A ( ꞊ ) MODIFIER LETTER SHORT EQUALS SIGN + /// U+AB5B ( ꭛ ) MODIFIER BREVE WITH INVERTED BREVE + /// and Ideographic = No + /// and Word_Break ≠ Katakana + /// and Line_Break ≠ Complex_Context (SA) + /// and Script ≠ Hiragana + /// and Word_Break ≠ Extend + /// and Word_Break ≠ Hebrew_Letter + /// ``` + ALetter { + abbr => LE, + long => ALetter, + human => "Alphabetic Letter", + } + + /// ```text + /// U+0027 ( ' ) APOSTROPHE + /// ``` + SingleQuote { + abbr => SQ, + long => Single_Quote, + human => "Single Quote", + } + + /// ```text + /// U+0022 ( " ) QUOTATION MARK + /// ``` + DQ { + abbr => DQ, + long => Double_Quote, + 
human => "Double Quote", + } + + /// ```text + /// U+002E ( . ) FULL STOP + /// U+2018 ( ‘ ) LEFT SINGLE QUOTATION MARK + /// U+2019 ( ’ ) RIGHT SINGLE QUOTATION MARK + /// U+2024 ( ․ ) ONE DOT LEADER + /// U+FE52 ( ﹒ ) SMALL FULL STOP + /// U+FF07 ( ' ) FULLWIDTH APOSTROPHE + /// U+FF0E ( . ) FULLWIDTH FULL STOP + /// ``` + MidNumLet { + abbr => MB, + long => MidNumLet, + human => "Middle of Numeric/Letter", + } + + /// ```text + /// U+00B7 ( · ) MIDDLE DOT + /// U+0387 ( · ) GREEK ANO TELEIA + /// U+05F4 ( ״ ) HEBREW PUNCTUATION GERSHAYIM + /// U+2027 ( ‧ ) HYPHENATION POINT + /// U+003A ( : ) COLON (used in Swedish) + /// U+FE13 ( ︓ ) PRESENTATION FORM FOR VERTICAL COLON + /// U+FE55 ( ﹕ ) SMALL COLON + /// U+FF1A ( : ) FULLWIDTH COLON + /// ``` + MidLetter { + abbr => ML, + long => MidLetter, + human => "Middle of Letter", + } + + /// ```text + /// Line_Break = Infix_Numeric, or + /// any of the following: + /// U+066C ( ٬ ) ARABIC THOUSANDS SEPARATOR + /// U+FE50 ( ﹐ ) SMALL COMMA + /// U+FE54 ( ﹔ ) SMALL SEMICOLON + /// U+FF0C ( , ) FULLWIDTH COMMA + /// U+FF1B ( ; ) FULLWIDTH SEMICOLON + /// and not U+003A ( : ) COLON + /// and not U+FE13 ( ︓ ) PRESENTATION FORM FOR VERTICAL COLON + /// and not U+002E ( . ) FULL STOP + /// ``` + MidNum { + abbr => MN, + long => MidNum, + human => "Middle of Numeric", + } + + /// ```text + /// Line_Break = Numeric + /// and not U+066C ( ٬ ) ARABIC THOUSANDS SEPARATOR + /// ``` + Numeric { + abbr => NU, + long => Numeric, + human => "Numeric", + } + + /// ```text + /// General_Category = Connector_Punctuation, or + /// U+202F NARROW NO-BREAK SPACE (NNBSP) + /// ``` + ExtendNumLet { + abbr => EX, + long => ExtendNumLet, + human => "Extend Numeric/Letter", + } + + // Emoji + + /// Emoji characters listed as `Emoji_Modifier_Base=Yes` in `emoji-data.txt`, which do not + /// occur after ZWJ in `emoji-zwj-sequences.txt`. + /// + /// See . 
+ EBase { + abbr => EB, + long => E_Base, + human => "Emoji Base", + } + + /// Emoji characters listed as `Emoji_Modifier=Yes` in `emoji-data.txt`. + /// + /// See . + EModifier { + abbr => EM, + long => E_Modifier, + human => "Emoji Modifier", + } + + /// Emoji characters that do not break from a previous ZWJ in a defined emoji ZWJ sequence, + /// and are not listed as `Emoji_Modifier_Base=Yes` in `emoji-data.txt`. + /// + /// See . + GlueAfterZwj { + abbr => GAZ, + long => Glue_After_Zwj, + human => "Glue After ZWJ", + } + + /// Emoji characters listed as `Emoji_Modifier_Base=Yes` in `emoji-data.txt`, and also occur + /// after ZWJ in `emoji-zwj-sequences.txt`. + /// + /// See . + EBaseGAZ { + abbr => EBG, + long => E_Base_GAZ, + human => "Emoji Base and Glue After ZWJ", + } + + /// Everything else + // TODO: Replace with `Option::None` + Other { + abbr => XX, + long => Other, + human => "Other", + } + + } + + /// Abbreviated name aliases for the + /// [*Word_Break*](http://www.unicode.org/reports/tr44/#Word_Break) + /// property. + /// + /// ## See Also + /// + /// * + pub mod abbr_names for abbr; + + /// Long name aliases for the + /// [*Word_Break*](http://www.unicode.org/reports/tr44/#Word_Break) + /// property. + /// + /// ## See Also + /// + /// * + pub mod long_names for long; +} + + +impl TotalCharProperty for WordBreak { + fn of(ch: char) -> Self { + Self::of(ch) + } +} + + +/// Ref: +/// +/// ```text +/// @missing: 0000..10FFFF; Other +/// ``` +impl Default for WordBreak { + #[inline] + fn default() -> Self { + WordBreak::Other + } +} + + +mod data { + use super::long_names as WB; + use unic_utils::CharDataTable; + pub const WORD_BREAK_TABLE: CharDataTable = + include!("../tables/word_break.rsv"); +} + + +impl WordBreak { + /// Find the character *Word_Break* property value.
+ pub fn of(ch: char) -> WordBreak { + data::WORD_BREAK_TABLE.find_or_default(ch) + } +} + + + +#[cfg(test)] +mod tests { + use unic_char_property::EnumeratedCharProperty; + use super::WordBreak as WB; + + #[test] + fn test_ascii() { + assert_eq!(WB::of('\u{0000}'), WB::Other); + assert_eq!(WB::of('\u{0040}'), WB::Other); + assert_eq!(WB::of('\u{0041}'), WB::ALetter); + assert_eq!(WB::of('\u{0062}'), WB::ALetter); + assert_eq!(WB::of('\u{007F}'), WB::Other); + } + + #[test] + fn test_bmp() { + // Hebrew + assert_eq!(WB::of('\u{0590}'), WB::Other); + assert_eq!(WB::of('\u{05D0}'), WB::HebrewLetter); + assert_eq!(WB::of('\u{05D1}'), WB::HebrewLetter); + assert_eq!(WB::of('\u{05FF}'), WB::Other); + + // Arabic + assert_eq!(WB::of('\u{0600}'), WB::Format); + assert_eq!(WB::of('\u{0627}'), WB::ALetter); + assert_eq!(WB::of('\u{07BF}'), WB::Other); + + // Default R + Arabic Extras + assert_eq!(WB::of('\u{07C0}'), WB::Numeric); + assert_eq!(WB::of('\u{085F}'), WB::Other); + assert_eq!(WB::of('\u{0860}'), WB::ALetter); + assert_eq!(WB::of('\u{0870}'), WB::Other); + assert_eq!(WB::of('\u{089F}'), WB::Other); + assert_eq!(WB::of('\u{08A0}'), WB::ALetter); + assert_eq!(WB::of('\u{089F}'), WB::Other); + assert_eq!(WB::of('\u{08FF}'), WB::Extend); + + // Default ET + assert_eq!(WB::of('\u{20A0}'), WB::Other); + assert_eq!(WB::of('\u{20CF}'), WB::Other); + + // Arabic Presentation Forms + assert_eq!(WB::of('\u{FB1D}'), WB::HebrewLetter); + assert_eq!(WB::of('\u{FB4F}'), WB::HebrewLetter); + assert_eq!(WB::of('\u{FB50}'), WB::ALetter); + assert_eq!(WB::of('\u{FDCF}'), WB::Other); + assert_eq!(WB::of('\u{FDF0}'), WB::ALetter); + assert_eq!(WB::of('\u{FDFF}'), WB::Other); + assert_eq!(WB::of('\u{FE70}'), WB::ALetter); + assert_eq!(WB::of('\u{FEFE}'), WB::Other); + assert_eq!(WB::of('\u{FEFF}'), WB::Format); + + // noncharacters + assert_eq!(WB::of('\u{FDD0}'), WB::Other); + assert_eq!(WB::of('\u{FDD1}'), WB::Other); + assert_eq!(WB::of('\u{FDEE}'), WB::Other); + 
assert_eq!(WB::of('\u{FDEF}'), WB::Other); + assert_eq!(WB::of('\u{FFFE}'), WB::Other); + assert_eq!(WB::of('\u{FFFF}'), WB::Other); + } + + #[test] + fn test_smp() { + // Default AL + R + assert_eq!(WB::of('\u{10800}'), WB::ALetter); + assert_eq!(WB::of('\u{10FFF}'), WB::Other); + assert_eq!(WB::of('\u{1E800}'), WB::ALetter); + assert_eq!(WB::of('\u{1EDFF}'), WB::Other); + assert_eq!(WB::of('\u{1EE00}'), WB::ALetter); + assert_eq!(WB::of('\u{1EEFF}'), WB::Other); + assert_eq!(WB::of('\u{1EF00}'), WB::Other); + assert_eq!(WB::of('\u{1EFFF}'), WB::Other); + } + + #[test] + fn test_unassigned_planes() { + assert_eq!(WB::of('\u{30000}'), WB::Other); + assert_eq!(WB::of('\u{40000}'), WB::Other); + assert_eq!(WB::of('\u{50000}'), WB::Other); + assert_eq!(WB::of('\u{60000}'), WB::Other); + assert_eq!(WB::of('\u{70000}'), WB::Other); + assert_eq!(WB::of('\u{80000}'), WB::Other); + assert_eq!(WB::of('\u{90000}'), WB::Other); + assert_eq!(WB::of('\u{a0000}'), WB::Other); + } + + #[test] + fn test_abbr_name() { + assert_eq!(WB::Other.abbr_name(), "XX"); + } + + #[test] + fn test_long_name() { + assert_eq!(WB::Other.long_name(), "Other"); + } + + #[test] + fn test_human_name() { + assert_eq!(WB::Other.human_name(), "Other"); + } + + #[test] + fn test_display() { + assert_eq!(format!("{}", WB::Other), "Other"); + } +} diff --git a/unic/ucd/src/lib.rs b/unic/ucd/src/lib.rs index 7bdaa396..36aee032 100644 --- a/unic/ucd/src/lib.rs +++ b/unic/ucd/src/lib.rs @@ -26,6 +26,7 @@ pub extern crate unic_ucd_category as category; pub extern crate unic_ucd_core as core; pub extern crate unic_ucd_name as name; pub extern crate unic_ucd_normal as normal; +pub extern crate unic_ucd_segment as segment; /// The [Unicode version](http://www.unicode.org/versions/) of data @@ -74,3 +75,5 @@ pub use category::GeneralCategory; pub use name::Name; pub use normal::CanonicalCombiningClass; + +pub use segment::{GraphemeClusterBreak, SentenceBreak, WordBreak};