Skip to content

Commit

Permalink
[regexp] Replace unicode property comparison instructions with generic set comparisons
Browse files Browse the repository at this point in the history
  • Loading branch information
Hans-Halverson committed Jan 19, 2025
1 parent a9d8374 commit a5b7658
Show file tree
Hide file tree
Showing 11 changed files with 422 additions and 348 deletions.
3 changes: 3 additions & 0 deletions icu/data/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ include!("xid_continue_v1_marker.rs.data");
include!("xid_start_v1_marker.rs.data");
include!("case_map_v1_marker.rs.data");
include!("general_category_v1_marker.rs.data");
include!("script_v1_marker.rs.data");
include!("script_with_extensions_property_v1_marker.rs.data");
/// Marks a type as a data provider. You can then use macros like
/// `impl_core_helloworld_v1` to add implementations.
Expand Down Expand Up @@ -159,6 +160,7 @@ macro_rules! impl_data_provider {
impl_xid_start_v1_marker!($provider);
impl_case_map_v1_marker!($provider);
impl_general_category_v1_marker!($provider);
impl_script_v1_marker!($provider);
impl_script_with_extensions_property_v1_marker!($provider);
};
}
Expand Down Expand Up @@ -234,6 +236,7 @@ macro_rules! impl_any_provider {
h if h == <icu_properties::provider::XidStartV1Marker as icu_provider::DataMarker>::INFO.path.hashed() => icu_provider::DataProvider::<icu_properties::provider::XidStartV1Marker>::load(self, req).map(icu_provider::DataResponse::wrap_into_any_response),
h if h == <icu_casemap::provider::CaseMapV1Marker as icu_provider::DataMarker>::INFO.path.hashed() => icu_provider::DataProvider::<icu_casemap::provider::CaseMapV1Marker>::load(self, req).map(icu_provider::DataResponse::wrap_into_any_response),
h if h == <icu_properties::provider::GeneralCategoryV1Marker as icu_provider::DataMarker>::INFO.path.hashed() => icu_provider::DataProvider::<icu_properties::provider::GeneralCategoryV1Marker>::load(self, req).map(icu_provider::DataResponse::wrap_into_any_response),
h if h == <icu_properties::provider::ScriptV1Marker as icu_provider::DataMarker>::INFO.path.hashed() => icu_provider::DataProvider::<icu_properties::provider::ScriptV1Marker>::load(self, req).map(icu_provider::DataResponse::wrap_into_any_response),
h if h == <icu_properties::provider::ScriptWithExtensionsPropertyV1Marker as icu_provider::DataMarker>::INFO.path.hashed() => icu_provider::DataProvider::<icu_properties::provider::ScriptWithExtensionsPropertyV1Marker>::load(self, req).map(icu_provider::DataResponse::wrap_into_any_response),
_ => Err(icu_provider::DataErrorKind::MarkerNotFound.with_req(marker, req)),
}
Expand Down
99 changes: 99 additions & 0 deletions icu/data/script_v1_marker.rs.data

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions icu/markers.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ normalizer/nfkd@1
normalizer/nfkdex@1
props/casemap@1
props/gc@1
props/sc@1
props/scx@1
propnames/from/sc@2
props/AHex@1
Expand Down
89 changes: 9 additions & 80 deletions src/js/common/icu.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,87 +31,13 @@ pub struct ICU {
/// Lookup data for the Unicode general category property.
///
/// Only the code point to general category classifier is stored here; the
/// struct carries no per-category code point sets.
pub struct GeneralCategories {
    /// Classifier which maps code points to general categories
    pub classifier: CodePointMapDataBorrowed<'static, GeneralCategory>,
}

/// Lookup data for Unicode scripts: code point classifiers plus a parser
/// from script names to `Script` values.
pub struct Scripts {
// NOTE(review): this field has the same type as `script_with_extension_classifier`
// below and is presumably superseded by it — confirm whether it should remain.
/// Classifier which maps code points to scripts or sets of scripts
pub classifier: ScriptWithExtensionsBorrowed<'static>,
/// Classifier which maps code points to scripts (without extensions)
pub script_classifier: CodePointMapDataBorrowed<'static, Script>,
/// Classifier which maps code points to sets of scripts, with script extensions
pub script_with_extension_classifier: ScriptWithExtensionsBorrowed<'static>,
/// Mapper which maps script names to the `Script` enum
pub names: PropertyParserBorrowed<'static, Script>,
}
Expand Down Expand Up @@ -298,7 +224,9 @@ pub static ICU: LazyLock<ICU> = LazyLock::new(|| {
binary_property_static!(XID_START_SET, XidStart);

// Scripts
static SCRIPT_CLASSIFIER: LazyLock<ScriptWithExtensions> =
static SCRIPT_MAP: LazyLock<CodePointMapData<Script>> =
LazyLock::new(|| CodePointMapData::<Script>::try_new_unstable(&BakedDataProvider).unwrap());
static SCRIPT_WITH_EXTENSIONS_CLASSIFIER: LazyLock<ScriptWithExtensions> =
LazyLock::new(|| ScriptWithExtensions::try_new_unstable(&BakedDataProvider).unwrap());
static SCRIPT_NAMES: LazyLock<PropertyParser<Script>> =
LazyLock::new(|| PropertyParser::try_new_unstable(&BakedDataProvider).unwrap());
Expand All @@ -316,7 +244,8 @@ pub static ICU: LazyLock<ICU> = LazyLock::new(|| {
ICU {
general_categories: GeneralCategories { classifier: GENERAL_CATEGORIES_MAP.as_borrowed() },
scripts: Scripts {
classifier: SCRIPT_CLASSIFIER.as_borrowed(),
script_classifier: SCRIPT_MAP.as_borrowed(),
script_with_extension_classifier: SCRIPT_WITH_EXTENSIONS_CLASSIFIER.as_borrowed(),
names: SCRIPT_NAMES.as_borrowed(),
},
properties: Properties {
Expand Down
2 changes: 1 addition & 1 deletion src/js/common/unicode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ pub type CodeUnit = u16;
pub type CodePoint = u32;

/// Highest valid Unicode code point (U+10FFFF)
const MAX_CODE_POINT: CodePoint = 0x10FFFF;
pub const MAX_CODE_POINT: CodePoint = 0x10FFFF;

// Start of high surrogate range, inclusive
const HIGH_SURROGATE_START: CodeUnit = 0xD800;
Expand Down
Loading

0 comments on commit a5b7658

Please sign in to comment.