@@ -64,75 +64,122 @@ impl core::fmt::Display for UnicodeWordError {
64
64
}
65
65
}
66
66
67
- /// Return an iterator over the equivalence class of simple case mappings
68
- /// for the given codepoint. The equivalence class does not include the
69
- /// given codepoint.
70
- ///
71
- /// If the equivalence class is empty, then this returns the next scalar
72
- /// value that has a non-empty equivalence class, if it exists. If no such
73
- /// scalar value exists, then `None` is returned. The point of this behavior
74
- /// is to permit callers to avoid calling `simple_fold` more than they need
75
- /// to, since there is some cost to fetching the equivalence class.
76
- ///
77
- /// This returns an error if the Unicode case folding tables are not available.
78
- pub fn simple_fold (
79
- c : char ,
80
- ) -> Result < Result < impl Iterator < Item = char > , Option < char > > , CaseFoldError > {
81
- #[ cfg( not( feature = "unicode-case" ) ) ]
82
- fn imp (
83
- _: char ,
84
- ) -> Result < Result < impl Iterator < Item = char > , Option < char > > , CaseFoldError >
85
- {
86
- use core:: option:: IntoIter ;
87
- Err :: < core:: result:: Result < IntoIter < char > , _ > , _ > ( CaseFoldError ( ( ) ) )
88
- }
67
/// A stateful cursor over the simple case folding table.
///
/// Build one with `SimpleCaseFolder::new()`; construction fails with an
/// error when the underlying case folding table is unavailable.
///
/// Once built, callers are expected to invoke `SimpleCaseFolder::mapping`
/// with codepoints in strictly increasing order. Looking up `b` and then `a`,
/// for instance, breaks that contract and panics.
///
/// The ordering requirement is what makes lookups cheap: the folder remembers
/// where the previous lookup landed in the (sorted) table and uses that
/// position as its first guess for the next one.
#[derive(Debug)]
pub struct SimpleCaseFolder {
    /// The simple case fold table: an association list sorted by key. Each
    /// key is a Unicode scalar value and each value is that key's "simple"
    /// case folding equivalence class, with the key itself left out.
    table: &'static [(char, &'static [char])],
    /// The codepoint passed to the most recent lookup, or `None` if no lookup
    /// has happened yet.
    last: Option<char>,
    /// Index into `table` of the entry with the smallest key strictly greater
    /// than the most recently looked-up codepoint. That codepoint need not be
    /// a key of the table itself. Equal to `table.len()` once no such entry
    /// remains.
    next: usize,
}
89
94
90
- #[ cfg( feature = "unicode-case" ) ]
91
- fn imp (
92
- c : char ,
93
- ) -> Result < Result < impl Iterator < Item = char > , Option < char > > , CaseFoldError >
94
- {
95
- use crate :: unicode_tables:: case_folding_simple:: CASE_FOLDING_SIMPLE ;
96
-
97
- Ok ( CASE_FOLDING_SIMPLE
98
- . binary_search_by_key ( & c, |& ( c1, _) | c1)
99
- . map ( |i| CASE_FOLDING_SIMPLE [ i] . 1 . iter ( ) . copied ( ) )
100
- . map_err ( |i| {
101
- if i >= CASE_FOLDING_SIMPLE . len ( ) {
102
- None
103
- } else {
104
- Some ( CASE_FOLDING_SIMPLE [ i] . 0 )
105
- }
106
- } ) )
95
+ impl SimpleCaseFolder {
96
+ /// Create a new simple case folder, returning an error if the underlying
97
+ /// case folding table is unavailable.
98
+ pub fn new ( ) -> Result < SimpleCaseFolder , CaseFoldError > {
99
+ #[ cfg( not( feature = "unicode-case" ) ) ]
100
+ {
101
+ Err ( CaseFoldError ( ( ) ) )
102
+ }
103
+ #[ cfg( feature = "unicode-case" ) ]
104
+ {
105
+ Ok ( SimpleCaseFolder {
106
+ table : crate :: unicode_tables:: case_folding_simple:: CASE_FOLDING_SIMPLE ,
107
+ last : None ,
108
+ next : 0 ,
109
+ } )
110
+ }
107
111
}
108
112
109
- imp ( c)
110
- }
111
-
112
- /// Returns true if and only if the given (inclusive) range contains at least
113
- /// one Unicode scalar value that has a non-empty non-trivial simple case
114
- /// mapping.
115
- ///
116
- /// This function panics if `end < start`.
117
- ///
118
- /// This returns an error if the Unicode case folding tables are not available.
119
- pub fn contains_simple_case_mapping (
120
- start : char ,
121
- end : char ,
122
- ) -> Result < bool , CaseFoldError > {
123
- #[ cfg( not( feature = "unicode-case" ) ) ]
124
- fn imp ( _: char , _: char ) -> Result < bool , CaseFoldError > {
125
- Err ( CaseFoldError ( ( ) ) )
113
+ /// Return the equivalence class of case folded codepoints for the given
114
+ /// codepoint. The equivalence class returned never includes the codepoint
115
+ /// given. If the given codepoint has no case folded codepoints (i.e.,
116
+ /// no entry in the underlying case folding table), then this returns an
117
+ /// empty slice.
118
+ ///
119
+ /// # Panics
120
+ ///
121
+ /// This panics when called with a `c` that is less than or equal to the
122
+ /// previous call. In other words, callers need to use this method with
123
+ /// strictly increasing values of `c`.
124
+ pub fn mapping ( & mut self , c : char ) -> & ' static [ char ] {
125
+ if let Some ( last) = self . last {
126
+ assert ! (
127
+ last < c,
128
+ "got codepoint U+{:X} which occurs before \
129
+ last codepoint U+{:X}",
130
+ u32 :: from( c) ,
131
+ u32 :: from( last) ,
132
+ ) ;
133
+ }
134
+ self . last = Some ( c) ;
135
+ if self . next >= self . table . len ( ) {
136
+ return & [ ] ;
137
+ }
138
+ let ( k, v) = self . table [ self . next ] ;
139
+ if k == c {
140
+ self . next += 1 ;
141
+ return v;
142
+ }
143
+ match self . get ( c) {
144
+ Err ( i) => {
145
+ self . next = i;
146
+ & [ ]
147
+ }
148
+ Ok ( i) => {
149
+ // Since we require lookups to proceed
150
+ // in order, anything we find should be
151
+ // after whatever we thought might be
152
+ // next. Otherwise, the caller is either
153
+ // going out of order or we would have
154
+ // found our next key at 'self.next'.
155
+ assert ! ( i > self . next) ;
156
+ self . next = i + 1 ;
157
+ self . table [ i] . 1
158
+ }
159
+ }
126
160
}
127
161
128
- #[ cfg( feature = "unicode-case" ) ]
129
- fn imp ( start : char , end : char ) -> Result < bool , CaseFoldError > {
162
+ /// Returns true if and only if the given range overlaps with any region
163
+ /// of the underlying case folding table. That is, when true, there exists
164
+ /// at least one codepoint in the inclusive range `[start, end]` that has
165
+ /// a non-trivial equivalence class of case folded codepoints. Conversely,
166
+ /// when this returns false, all codepoints in the range `[start, end]`
167
+ /// correspond to the trivial equivalence class of case folded codepoints,
168
+ /// i.e., itself.
169
+ ///
170
+ /// This is useful to call before iterating over the codepoints in the
171
+ /// range and looking up the mapping for each. If you know none of the
172
+ /// mappings will return anything, then you might be able to skip doing it
173
+ /// altogether.
174
+ ///
175
+ /// # Panics
176
+ ///
177
+ /// This panics when `end < start`.
178
+ pub fn overlaps ( & self , start : char , end : char ) -> bool {
130
179
use core:: cmp:: Ordering ;
131
180
132
- use crate :: unicode_tables:: case_folding_simple:: CASE_FOLDING_SIMPLE ;
133
-
134
181
assert ! ( start <= end) ;
135
- Ok ( CASE_FOLDING_SIMPLE
182
+ self . table
136
183
. binary_search_by ( |& ( c, _) | {
137
184
if start <= c && c <= end {
138
185
Ordering :: Equal
@@ -142,10 +189,15 @@ pub fn contains_simple_case_mapping(
142
189
Ordering :: Less
143
190
}
144
191
} )
145
- . is_ok ( ) )
192
+ . is_ok ( )
146
193
}
147
194
148
- imp ( start, end)
195
+ /// Returns the index at which `c` occurs in the simple case fold table. If
196
+ /// `c` does not occur, then this returns an `i` such that `table[i-1].0 <
197
+ /// c` and `table[i].0 > c`.
198
+ fn get ( & self , c : char ) -> Result < usize , usize > {
199
+ self . table . binary_search_by_key ( & c, |& ( c1, _) | c1)
200
+ }
149
201
}
150
202
151
203
/// A query for finding a character class defined by Unicode. This supports
@@ -897,20 +949,12 @@ mod tests {
897
949
898
950
#[ cfg( feature = "unicode-case" ) ]
899
951
fn simple_fold_ok ( c : char ) -> impl Iterator < Item = char > {
900
- simple_fold ( c) . unwrap ( ) . unwrap ( )
901
- }
902
-
903
- #[ cfg( feature = "unicode-case" ) ]
904
- fn simple_fold_err ( c : char ) -> Option < char > {
905
- match simple_fold ( c) . unwrap ( ) {
906
- Ok ( _) => unreachable ! ( "simple_fold returned Ok iterator" ) ,
907
- Err ( next) => next,
908
- }
952
+ SimpleCaseFolder :: new ( ) . unwrap ( ) . mapping ( c) . iter ( ) . copied ( )
909
953
}
910
954
911
955
#[ cfg( feature = "unicode-case" ) ]
912
956
fn contains_case_map ( start : char , end : char ) -> bool {
913
- contains_simple_case_mapping ( start , end ) . unwrap ( )
957
+ SimpleCaseFolder :: new ( ) . unwrap ( ) . overlaps ( start , end )
914
958
}
915
959
916
960
#[ test]
@@ -936,26 +980,10 @@ mod tests {
936
980
assert_eq ! ( xs, alloc:: vec![ 'a' ] ) ;
937
981
}
938
982
939
- #[ test]
940
- #[ cfg( feature = "unicode-case" ) ]
941
- fn simple_fold_empty ( ) {
942
- assert_eq ! ( Some ( 'A' ) , simple_fold_err( '?' ) ) ;
943
- assert_eq ! ( Some ( 'A' ) , simple_fold_err( '@' ) ) ;
944
- assert_eq ! ( Some ( 'a' ) , simple_fold_err( '[' ) ) ;
945
- assert_eq ! ( Some ( 'Ⰰ' ) , simple_fold_err( '☃' ) ) ;
946
- }
947
-
948
- #[ test]
949
- #[ cfg( feature = "unicode-case" ) ]
950
- fn simple_fold_max ( ) {
951
- assert_eq ! ( None , simple_fold_err( '\u{10FFFE}' ) ) ;
952
- assert_eq ! ( None , simple_fold_err( '\u{10FFFF}' ) ) ;
953
- }
954
-
955
983
#[ test]
956
984
#[ cfg( not( feature = "unicode-case" ) ) ]
957
985
fn simple_fold_disabled ( ) {
958
- assert ! ( simple_fold ( 'a' ) . is_err( ) ) ;
986
+ assert ! ( SimpleCaseFolder :: new ( ) . is_err( ) ) ;
959
987
}
960
988
961
989
#[ test]
@@ -974,12 +1002,6 @@ mod tests {
974
1002
assert ! ( !contains_case_map( '☃' , '☃' ) ) ;
975
1003
}
976
1004
977
- #[ test]
978
- #[ cfg( not( feature = "unicode-case" ) ) ]
979
- fn range_contains_disabled ( ) {
980
- assert ! ( contains_simple_case_mapping( 'a' , 'a' ) . is_err( ) ) ;
981
- }
982
-
983
1005
#[ test]
984
1006
#[ cfg( feature = "unicode-gencat" ) ]
985
1007
fn regression_466 ( ) {
0 commit comments