Skip to content

Commit f052484

Browse files
committed
syntax: refactor and optimize case folding
This rewrites how Unicode simple case folding worked. Instead of just defining a single function and expecting callers to deal with the fallout, we now define a stateful type that "knows" about the structure of the case folding table. For example, it now knows enough to avoid binary search lookups in most cases. All we really have to do is require that callers look up codepoints in sequence, which is perfectly fine for our use case. Ref #893
1 parent 156cb47 commit f052484

File tree

4 files changed

+125
-112
lines changed

4 files changed

+125
-112
lines changed

regex-syntax/src/hir/interval.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ impl<I: Interval> IntervalSet<I> {
131131

132132
/// Union this set with the given set, in place.
133133
pub fn union(&mut self, other: &IntervalSet<I>) {
134-
if other.ranges.is_empty() {
134+
if other.ranges.is_empty() || self.ranges == other.ranges {
135135
return;
136136
}
137137
// This could almost certainly be done more efficiently.

regex-syntax/src/hir/mod.rs

+3-13
Original file line numberDiff line numberDiff line change
@@ -1232,23 +1232,13 @@ impl Interval for ClassUnicodeRange {
12321232
&self,
12331233
ranges: &mut Vec<ClassUnicodeRange>,
12341234
) -> Result<(), unicode::CaseFoldError> {
1235-
if !unicode::contains_simple_case_mapping(self.start, self.end)? {
1235+
let mut folder = unicode::SimpleCaseFolder::new()?;
1236+
if !folder.overlaps(self.start, self.end) {
12361237
return Ok(());
12371238
}
12381239
let (start, end) = (u32::from(self.start), u32::from(self.end));
1239-
let mut next_simple_cp = None;
12401240
for cp in (start..=end).filter_map(char::from_u32) {
1241-
if next_simple_cp.map_or(false, |next| cp < next) {
1242-
continue;
1243-
}
1244-
let it = match unicode::simple_fold(cp)? {
1245-
Ok(it) => it,
1246-
Err(next) => {
1247-
next_simple_cp = next;
1248-
continue;
1249-
}
1250-
};
1251-
for cp_folded in it {
1241+
for &cp_folded in folder.mapping(cp) {
12521242
ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded));
12531243
}
12541244
}

regex-syntax/src/hir/translate.rs

+3-2
Original file line numberDiff line numberDiff line change
@@ -824,8 +824,9 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
824824
}
825825
if self.flags().unicode() {
826826
// If case folding won't do anything, then don't bother trying.
827-
let map =
828-
unicode::contains_simple_case_mapping(c, c).map_err(|_| {
827+
let map = unicode::SimpleCaseFolder::new()
828+
.map(|f| f.overlaps(c, c))
829+
.map_err(|_| {
829830
self.error(span, ErrorKind::UnicodeCaseUnavailable)
830831
})?;
831832
if !map {

regex-syntax/src/unicode.rs

+118-96
Original file line numberDiff line numberDiff line change
@@ -64,75 +64,122 @@ impl core::fmt::Display for UnicodeWordError {
6464
}
6565
}
6666

67-
/// Return an iterator over the equivalence class of simple case mappings
68-
/// for the given codepoint. The equivalence class does not include the
69-
/// given codepoint.
70-
///
71-
/// If the equivalence class is empty, then this returns the next scalar
72-
/// value that has a non-empty equivalence class, if it exists. If no such
73-
/// scalar value exists, then `None` is returned. The point of this behavior
74-
/// is to permit callers to avoid calling `simple_fold` more than they need
75-
/// to, since there is some cost to fetching the equivalence class.
76-
///
77-
/// This returns an error if the Unicode case folding tables are not available.
78-
pub fn simple_fold(
79-
c: char,
80-
) -> Result<Result<impl Iterator<Item = char>, Option<char>>, CaseFoldError> {
81-
#[cfg(not(feature = "unicode-case"))]
82-
fn imp(
83-
_: char,
84-
) -> Result<Result<impl Iterator<Item = char>, Option<char>>, CaseFoldError>
85-
{
86-
use core::option::IntoIter;
87-
Err::<core::result::Result<IntoIter<char>, _>, _>(CaseFoldError(()))
88-
}
67+
/// A state oriented traverser of the simple case folding table.
68+
///
69+
/// A case folder can be constructed via `SimpleCaseFolder::new()`, which will
70+
/// return an error if the underlying case folding table is unavailable.
71+
///
72+
/// After construction, it is expected that callers will use
73+
/// `SimpleCaseFolder::mapping` by calling it with codepoints in strictly
74+
/// increasing order. For example, calling it on `b` and then on `a` is illegal
75+
/// and will result in a panic.
76+
///
77+
/// The main idea of this type is that it tries hard to make mapping lookups
78+
/// fast by exploiting the structure of the underlying table, and the ordering
79+
/// assumption enables this.
80+
#[derive(Debug)]
81+
pub struct SimpleCaseFolder {
82+
/// The simple case fold table. It's a sorted association list, where the
83+
/// keys are Unicode scalar values and the values are the corresponding
84+
/// equivalence class (not including the key) of the "simple" case folded
85+
/// Unicode scalar values.
86+
table: &'static [(char, &'static [char])],
87+
/// The last codepoint that was used for a lookup.
88+
last: Option<char>,
89+
/// The index to the entry in `table` corresponding to the smallest key `k`
90+
/// such that `k > k0`, where `k0` is the most recent key lookup. Note that
91+
/// in particular, `k0` may not be in the table!
92+
next: usize,
93+
}
8994

90-
#[cfg(feature = "unicode-case")]
91-
fn imp(
92-
c: char,
93-
) -> Result<Result<impl Iterator<Item = char>, Option<char>>, CaseFoldError>
94-
{
95-
use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
96-
97-
Ok(CASE_FOLDING_SIMPLE
98-
.binary_search_by_key(&c, |&(c1, _)| c1)
99-
.map(|i| CASE_FOLDING_SIMPLE[i].1.iter().copied())
100-
.map_err(|i| {
101-
if i >= CASE_FOLDING_SIMPLE.len() {
102-
None
103-
} else {
104-
Some(CASE_FOLDING_SIMPLE[i].0)
105-
}
106-
}))
95+
impl SimpleCaseFolder {
96+
/// Create a new simple case folder, returning an error if the underlying
97+
/// case folding table is unavailable.
98+
pub fn new() -> Result<SimpleCaseFolder, CaseFoldError> {
99+
#[cfg(not(feature = "unicode-case"))]
100+
{
101+
Err(CaseFoldError(()))
102+
}
103+
#[cfg(feature = "unicode-case")]
104+
{
105+
Ok(SimpleCaseFolder {
106+
table: crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE,
107+
last: None,
108+
next: 0,
109+
})
110+
}
107111
}
108112

109-
imp(c)
110-
}
111-
112-
/// Returns true if and only if the given (inclusive) range contains at least
113-
/// one Unicode scalar value that has a non-empty non-trivial simple case
114-
/// mapping.
115-
///
116-
/// This function panics if `end < start`.
117-
///
118-
/// This returns an error if the Unicode case folding tables are not available.
119-
pub fn contains_simple_case_mapping(
120-
start: char,
121-
end: char,
122-
) -> Result<bool, CaseFoldError> {
123-
#[cfg(not(feature = "unicode-case"))]
124-
fn imp(_: char, _: char) -> Result<bool, CaseFoldError> {
125-
Err(CaseFoldError(()))
113+
/// Return the equivalence class of case folded codepoints for the given
114+
/// codepoint. The equivalence class returned never includes the codepoint
115+
/// given. If the given codepoint has no case folded codepoints (i.e.,
116+
/// no entry in the underlying case folding table), then this returns an
117+
/// empty slice.
118+
///
119+
/// # Panics
120+
///
121+
/// This panics when called with a `c` that is less than or equal to the
122+
/// previous call. In other words, callers need to use this method with
123+
/// strictly increasing values of `c`.
124+
pub fn mapping(&mut self, c: char) -> &'static [char] {
125+
if let Some(last) = self.last {
126+
assert!(
127+
last < c,
128+
"got codepoint U+{:X} which occurs before \
129+
last codepoint U+{:X}",
130+
u32::from(c),
131+
u32::from(last),
132+
);
133+
}
134+
self.last = Some(c);
135+
if self.next >= self.table.len() {
136+
return &[];
137+
}
138+
let (k, v) = self.table[self.next];
139+
if k == c {
140+
self.next += 1;
141+
return v;
142+
}
143+
match self.get(c) {
144+
Err(i) => {
145+
self.next = i;
146+
&[]
147+
}
148+
Ok(i) => {
149+
// Since we require lookups to proceed
150+
// in order, anything we find should be
151+
// after whatever we thought might be
152+
// next. Otherwise, the caller is either
153+
// going out of order or we would have
154+
// found our next key at 'self.next'.
155+
assert!(i > self.next);
156+
self.next = i + 1;
157+
self.table[i].1
158+
}
159+
}
126160
}
127161

128-
#[cfg(feature = "unicode-case")]
129-
fn imp(start: char, end: char) -> Result<bool, CaseFoldError> {
162+
/// Returns true if and only if the given range overlaps with any region
163+
/// of the underlying case folding table. That is, when true, there exists
164+
/// at least one codepoint in the inclusive range `[start, end]` that has
165+
/// a non-trivial equivalence class of case folded codepoints. Conversely,
166+
/// when this returns false, all codepoints in the range `[start, end]`
167+
/// correspond to the trivial equivalence class of case folded codepoints,
168+
/// i.e., itself.
169+
///
170+
/// This is useful to call before iterating over the codepoints in the
171+
/// range and looking up the mapping for each. If you know none of the
172+
/// mappings will return anything, then you might be able to skip doing it
173+
/// altogether.
174+
///
175+
/// # Panics
176+
///
177+
/// This panics when `end < start`.
178+
pub fn overlaps(&self, start: char, end: char) -> bool {
130179
use core::cmp::Ordering;
131180

132-
use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
133-
134181
assert!(start <= end);
135-
Ok(CASE_FOLDING_SIMPLE
182+
self.table
136183
.binary_search_by(|&(c, _)| {
137184
if start <= c && c <= end {
138185
Ordering::Equal
@@ -142,10 +189,15 @@ pub fn contains_simple_case_mapping(
142189
Ordering::Less
143190
}
144191
})
145-
.is_ok())
192+
.is_ok()
146193
}
147194

148-
imp(start, end)
195+
/// Returns the index at which `c` occurs in the simple case fold table. If
196+
/// `c` does not occur, then this returns an `i` such that `table[i-1].0 <
197+
/// c` and `table[i].0 > c`.
198+
fn get(&self, c: char) -> Result<usize, usize> {
199+
self.table.binary_search_by_key(&c, |&(c1, _)| c1)
200+
}
149201
}
150202

151203
/// A query for finding a character class defined by Unicode. This supports
@@ -897,20 +949,12 @@ mod tests {
897949

898950
#[cfg(feature = "unicode-case")]
899951
fn simple_fold_ok(c: char) -> impl Iterator<Item = char> {
900-
simple_fold(c).unwrap().unwrap()
901-
}
902-
903-
#[cfg(feature = "unicode-case")]
904-
fn simple_fold_err(c: char) -> Option<char> {
905-
match simple_fold(c).unwrap() {
906-
Ok(_) => unreachable!("simple_fold returned Ok iterator"),
907-
Err(next) => next,
908-
}
952+
SimpleCaseFolder::new().unwrap().mapping(c).iter().copied()
909953
}
910954

911955
#[cfg(feature = "unicode-case")]
912956
fn contains_case_map(start: char, end: char) -> bool {
913-
contains_simple_case_mapping(start, end).unwrap()
957+
SimpleCaseFolder::new().unwrap().overlaps(start, end)
914958
}
915959

916960
#[test]
@@ -936,26 +980,10 @@ mod tests {
936980
assert_eq!(xs, alloc::vec!['a']);
937981
}
938982

939-
#[test]
940-
#[cfg(feature = "unicode-case")]
941-
fn simple_fold_empty() {
942-
assert_eq!(Some('A'), simple_fold_err('?'));
943-
assert_eq!(Some('A'), simple_fold_err('@'));
944-
assert_eq!(Some('a'), simple_fold_err('['));
945-
assert_eq!(Some('Ⰰ'), simple_fold_err('☃'));
946-
}
947-
948-
#[test]
949-
#[cfg(feature = "unicode-case")]
950-
fn simple_fold_max() {
951-
assert_eq!(None, simple_fold_err('\u{10FFFE}'));
952-
assert_eq!(None, simple_fold_err('\u{10FFFF}'));
953-
}
954-
955983
#[test]
956984
#[cfg(not(feature = "unicode-case"))]
957985
fn simple_fold_disabled() {
958-
assert!(simple_fold('a').is_err());
986+
assert!(SimpleCaseFolder::new().is_err());
959987
}
960988

961989
#[test]
@@ -974,12 +1002,6 @@ mod tests {
9741002
assert!(!contains_case_map('☃', '☃'));
9751003
}
9761004

977-
#[test]
978-
#[cfg(not(feature = "unicode-case"))]
979-
fn range_contains_disabled() {
980-
assert!(contains_simple_case_mapping('a', 'a').is_err());
981-
}
982-
9831005
#[test]
9841006
#[cfg(feature = "unicode-gencat")]
9851007
fn regression_466() {

0 commit comments

Comments
 (0)