Skip to content

Commit a7ba701

Browse files
authored
General ascii fast paths for character classes (#644)
General ASCII fast-paths for builtin character classes
1 parent 348e6c3 commit a7ba701

File tree

9 files changed

+395
-184
lines changed

9 files changed

+395
-184
lines changed

Documentation/ProgrammersManual.md

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Programmer's Manual
2+
3+
## Programming patterns
4+
5+
### Engine quick checks and fast paths
6+
7+
In the engine nomenclature, a quick check results in a yes/no/maybe, while a thorough check always results in a definite answer.
8+
9+
The nature of quick checks and fast paths is that they bifurcate testing coverage: an input that is handled by the quick path no longer exercises the thorough path. One easy way to guard against this in simple cases is to assert that a definite quick result matches the thorough result.
10+
11+
One example of this pattern is matching against a builtin character class. The engine has a `_matchBuiltinCC` function, which is split into a quick check and a thorough check as follows:
12+
13+
```swift
14+
func _matchBuiltinCC(...) -> Input.Index? {
15+
// Calls _quickMatchBuiltinCC, if that gives a definite result
16+
// asserts that it is the same as the result of
17+
// _thoroughMatchBuiltinCC and returns it. Otherwise returns the
18+
// result of _thoroughMatchBuiltinCC
19+
}
20+
21+
@inline(__always)
22+
func _quickMatchBuiltinCC(...) -> QuickResult<Input.Index?>
23+
24+
@inline(never)
25+
func _thoroughMatchBuiltinCC(...) -> Input.Index?
26+
```
27+
28+
The thorough check is never inlined, as it is a lot of cold code. Note that the quick and thorough functions should be pure; that is, they shouldn't update processor state.
29+
30+

Sources/_StringProcessing/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ add_library(_StringProcessing
4747
Regex/DSLTree.swift
4848
Regex/Match.swift
4949
Regex/Options.swift
50+
Unicode/ASCII.swift
5051
Unicode/CaseConversion.swift
5152
Unicode/CharacterProps.swift
5253
Unicode/Comparison.swift

Sources/_StringProcessing/Engine/MEBuiltins.swift

+160-99
Original file line numberDiff line numberDiff line change
@@ -9,114 +9,26 @@ extension Character {
99
}
1010

1111
extension Processor {
12-
mutating func matchBuiltin(
12+
mutating func matchBuiltinCC(
1313
_ cc: _CharacterClassModel.Representation,
14-
_ isInverted: Bool,
15-
_ isStrictASCII: Bool,
16-
_ isScalarSemantics: Bool
14+
isInverted: Bool,
15+
isStrictASCII: Bool,
16+
isScalarSemantics: Bool
1717
) -> Bool {
18-
guard let next = _doMatchBuiltin(
18+
guard let next = input._matchBuiltinCC(
1919
cc,
20-
isInverted,
21-
isStrictASCII,
22-
isScalarSemantics
20+
at: currentPosition,
21+
isInverted: isInverted,
22+
isStrictASCII: isStrictASCII,
23+
isScalarSemantics: isScalarSemantics
2324
) else {
2425
signalFailure()
2526
return false
2627
}
2728
currentPosition = next
2829
return true
2930
}
30-
31-
func _doMatchBuiltin(
32-
_ cc: _CharacterClassModel.Representation,
33-
_ isInverted: Bool,
34-
_ isStrictASCII: Bool,
35-
_ isScalarSemantics: Bool
36-
) -> Input.Index? {
37-
guard let char = load(), let scalar = loadScalar() else {
38-
return nil
39-
}
40-
41-
let asciiCheck = !isStrictASCII
42-
|| (scalar.isASCII && isScalarSemantics)
43-
|| char.isASCII
44-
45-
var matched: Bool
46-
var next: Input.Index
47-
switch (isScalarSemantics, cc) {
48-
case (_, .anyGrapheme):
49-
next = input.index(after: currentPosition)
50-
case (_, .anyScalar):
51-
next = input.unicodeScalars.index(after: currentPosition)
52-
case (true, _):
53-
next = input.unicodeScalars.index(after: currentPosition)
54-
case (false, _):
55-
next = input.index(after: currentPosition)
56-
}
57-
58-
switch cc {
59-
case .any, .anyGrapheme:
60-
matched = true
61-
case .anyScalar:
62-
if isScalarSemantics {
63-
matched = true
64-
} else {
65-
matched = input.isOnGraphemeClusterBoundary(next)
66-
}
67-
case .digit:
68-
if isScalarSemantics {
69-
matched = scalar.properties.numericType != nil && asciiCheck
70-
} else {
71-
matched = char.isNumber && asciiCheck
72-
}
73-
case .horizontalWhitespace:
74-
if isScalarSemantics {
75-
matched = scalar.isHorizontalWhitespace && asciiCheck
76-
} else {
77-
matched = char._isHorizontalWhitespace && asciiCheck
78-
}
79-
case .verticalWhitespace:
80-
if isScalarSemantics {
81-
matched = scalar.isNewline && asciiCheck
82-
} else {
83-
matched = char._isNewline && asciiCheck
84-
}
85-
case .newlineSequence:
86-
if isScalarSemantics {
87-
matched = scalar.isNewline && asciiCheck
88-
if matched && scalar == "\r"
89-
&& next != input.endIndex && input.unicodeScalars[next] == "\n" {
90-
// Match a full CR-LF sequence even in scalar semantics
91-
input.unicodeScalars.formIndex(after: &next)
92-
}
93-
} else {
94-
matched = char._isNewline && asciiCheck
95-
}
96-
case .whitespace:
97-
if isScalarSemantics {
98-
matched = scalar.properties.isWhitespace && asciiCheck
99-
} else {
100-
matched = char.isWhitespace && asciiCheck
101-
}
102-
case .word:
103-
if isScalarSemantics {
104-
matched = scalar.properties.isAlphabetic && asciiCheck
105-
} else {
106-
matched = char.isWordCharacter && asciiCheck
107-
}
108-
}
109-
110-
if isInverted {
111-
matched.toggle()
112-
}
11331

114-
guard matched else {
115-
return nil
116-
}
117-
return next
118-
}
119-
12032
func isAtStartOfLine(_ payload: AssertionPayload) -> Bool {
12133
if currentPosition == subjectBounds.lowerBound { return true }
12234
switch payload.semanticLevel {
@@ -126,7 +38,7 @@ extension Processor {
12638
return input.unicodeScalars[input.unicodeScalars.index(before: currentPosition)].isNewline
12739
}
12840
}
129-
41+
13042
func isAtEndOfLine(_ payload: AssertionPayload) -> Bool {
13143
if currentPosition == subjectBounds.upperBound { return true }
13244
switch payload.semanticLevel {
@@ -169,7 +81,7 @@ extension Processor {
16981
return isAtStartOfLine(payload)
17082
case .endOfLine:
17183
return isAtEndOfLine(payload)
172-
84+
17385
case .caretAnchor:
17486
if payload.anchorsMatchNewlines {
17587
return isAtStartOfLine(payload)
@@ -202,3 +114,152 @@ extension Processor {
202114
}
203115
}
204116
}
117+
118+
// MARK: Built-in character class matching
119+
120+
extension String {
121+
122+
// Mentioned in ProgrammersManual.md, update docs if redesigned
123+
func _matchBuiltinCC(
124+
_ cc: _CharacterClassModel.Representation,
125+
at currentPosition: String.Index,
126+
isInverted: Bool,
127+
isStrictASCII: Bool,
128+
isScalarSemantics: Bool
129+
) -> String.Index? {
130+
guard currentPosition < endIndex else {
131+
return nil
132+
}
133+
if case .definite(let result) = _quickMatchBuiltinCC(
134+
cc,
135+
at: currentPosition,
136+
isInverted: isInverted,
137+
isStrictASCII: isStrictASCII,
138+
isScalarSemantics: isScalarSemantics
139+
) {
140+
assert(result == _thoroughMatchBuiltinCC(
141+
cc,
142+
at: currentPosition,
143+
isInverted: isInverted,
144+
isStrictASCII: isStrictASCII,
145+
isScalarSemantics: isScalarSemantics))
146+
return result
147+
}
148+
return _thoroughMatchBuiltinCC(
149+
cc,
150+
at: currentPosition,
151+
isInverted: isInverted,
152+
isStrictASCII: isStrictASCII,
153+
isScalarSemantics: isScalarSemantics)
154+
}
155+
156+
// Mentioned in ProgrammersManual.md, update docs if redesigned
157+
@inline(__always)
158+
func _quickMatchBuiltinCC(
159+
_ cc: _CharacterClassModel.Representation,
160+
at currentPosition: String.Index,
161+
isInverted: Bool,
162+
isStrictASCII: Bool,
163+
isScalarSemantics: Bool
164+
) -> QuickResult<String.Index?> {
165+
assert(currentPosition < endIndex)
166+
guard let (next, result) = _quickMatch(
167+
cc, at: currentPosition, isScalarSemantics: isScalarSemantics
168+
) else {
169+
return .unknown
170+
}
171+
return .definite(result == isInverted ? nil : next)
172+
}
173+
174+
// Mentioned in ProgrammersManual.md, update docs if redesigned
175+
@inline(never)
176+
func _thoroughMatchBuiltinCC(
177+
_ cc: _CharacterClassModel.Representation,
178+
at currentPosition: String.Index,
179+
isInverted: Bool,
180+
isStrictASCII: Bool,
181+
isScalarSemantics: Bool
182+
) -> String.Index? {
183+
assert(currentPosition < endIndex)
184+
let char = self[currentPosition]
185+
let scalar = unicodeScalars[currentPosition]
186+
187+
let asciiCheck = !isStrictASCII
188+
|| (scalar.isASCII && isScalarSemantics)
189+
|| char.isASCII
190+
191+
var matched: Bool
192+
var next: String.Index
193+
switch (isScalarSemantics, cc) {
194+
case (_, .anyGrapheme):
195+
next = index(after: currentPosition)
196+
case (_, .anyScalar):
197+
next = unicodeScalars.index(after: currentPosition)
198+
case (true, _):
199+
next = unicodeScalars.index(after: currentPosition)
200+
case (false, _):
201+
next = index(after: currentPosition)
202+
}
203+
204+
switch cc {
205+
case .any, .anyGrapheme:
206+
matched = true
207+
case .anyScalar:
208+
if isScalarSemantics {
209+
matched = true
210+
} else {
211+
matched = isOnGraphemeClusterBoundary(next)
212+
}
213+
case .digit:
214+
if isScalarSemantics {
215+
matched = scalar.properties.numericType != nil && asciiCheck
216+
} else {
217+
matched = char.isNumber && asciiCheck
218+
}
219+
case .horizontalWhitespace:
220+
if isScalarSemantics {
221+
matched = scalar.isHorizontalWhitespace && asciiCheck
222+
} else {
223+
matched = char._isHorizontalWhitespace && asciiCheck
224+
}
225+
case .verticalWhitespace:
226+
if isScalarSemantics {
227+
matched = scalar.isNewline && asciiCheck
228+
} else {
229+
matched = char._isNewline && asciiCheck
230+
}
231+
case .newlineSequence:
232+
if isScalarSemantics {
233+
matched = scalar.isNewline && asciiCheck
234+
if matched && scalar == "\r"
235+
&& next != endIndex && unicodeScalars[next] == "\n" {
236+
// Match a full CR-LF sequence even in scalar semantics
237+
unicodeScalars.formIndex(after: &next)
238+
}
239+
} else {
240+
matched = char._isNewline && asciiCheck
241+
}
242+
case .whitespace:
243+
if isScalarSemantics {
244+
matched = scalar.properties.isWhitespace && asciiCheck
245+
} else {
246+
matched = char.isWhitespace && asciiCheck
247+
}
248+
case .word:
249+
if isScalarSemantics {
250+
matched = scalar.properties.isAlphabetic && asciiCheck
251+
} else {
252+
matched = char.isWordCharacter && asciiCheck
253+
}
254+
}
255+
256+
if isInverted {
257+
matched.toggle()
258+
}
259+
260+
guard matched else {
261+
return nil
262+
}
263+
return next
264+
}
265+
}

Sources/_StringProcessing/Engine/MEQuantify.swift

+5-4
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,12 @@ extension Processor {
99
UnicodeScalar.init(_value: UInt32(payload.asciiChar)), true)
1010
case .builtin:
1111
// We only emit .quantify if it consumes a single character
12-
next = _doMatchBuiltin(
12+
next = input._matchBuiltinCC(
1313
payload.builtin,
14-
payload.builtinIsInverted,
15-
payload.builtinIsStrict,
16-
false)
14+
at: currentPosition,
15+
isInverted: payload.builtinIsInverted,
16+
isStrictASCII: payload.builtinIsStrict,
17+
isScalarSemantics: false)
1718
case .any:
1819
let matched = currentPosition != input.endIndex
1920
&& (!input[currentPosition].isNewline || payload.anyMatchesNewline)

Sources/_StringProcessing/Engine/Processor.swift

+4-4
Original file line numberDiff line numberDiff line change
@@ -583,11 +583,11 @@ extension Processor {
583583

584584
case .matchBuiltin:
585585
let payload = payload.characterClassPayload
586-
if matchBuiltin(
586+
if matchBuiltinCC(
587587
payload.cc,
588-
payload.isInverted,
589-
payload.isStrictASCII,
590-
payload.isScalarSemantics
588+
isInverted: payload.isInverted,
589+
isStrictASCII: payload.isStrictASCII,
590+
isScalarSemantics: payload.isScalarSemantics
591591
) {
592592
controller.step()
593593
}

0 commit comments

Comments
 (0)