Skip to content

Commit 345f18a

Browse files
snsmac authored and BurntSushi committed
syntax: \p{Sc} should map to \p{Currency_Symbol}
'sc' refers to the 'Currency_Symbol' general category, but is also the abbreviation for the 'Script' property. So when going through the canonicalization process, it would get normalized to 'Script' before being checked as a general category. We fix it by special-casing it. See also #719. Fixes #835, #899.
1 parent 544374b commit 345f18a

File tree

2 files changed

+7
-1
lines changed

2 files changed

+7
-1
lines changed

Diff for: regex-syntax/src/unicode.rs

+6 -1
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,12 @@ impl<'a> ClassQuery<'a> {
243243
// a general category. (Currently, we don't even support the
244244
// 'Case_Folding' property. But if we do in the future, users will be
245245
// required to spell it out.)
246-
if norm != "cf" {
246+
//
247+
// Also 'sc' refers to the 'Currency_Symbol' general category, but is
248+
// also the abbreviation for the 'Script' property. So we avoid calling
249+
// 'canonical_prop' for it too, which would erroneously normalize it
250+
// to 'Script'.
251+
if norm != "cf" && norm != "sc" {
247252
if let Some(canon) = canonical_prop(&norm)? {
248253
return Ok(CanonicalClassQuery::Binary(canon));
249254
}

Diff for: tests/unicode.rs

+1
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ mat!(uni_class_gencat_format, r"\p{Format}", "\u{E007F}", Some((0, 4)));
7777
// See: https://github.com/rust-lang/regex/issues/719
7878
mat!(uni_class_gencat_format_abbrev1, r"\p{cf}", "\u{E007F}", Some((0, 4)));
7979
mat!(uni_class_gencat_format_abbrev2, r"\p{gc=cf}", "\u{E007F}", Some((0, 4)));
80+
mat!(uni_class_gencat_format_abbrev3, r"\p{Sc}", "$", Some((0, 1)));
8081
mat!(
8182
uni_class_gencat_initial_punctuation,
8283
r"\p{Initial_Punctuation}",

0 commit comments

Comments
 (0)