-
Notifications
You must be signed in to change notification settings - Fork 51
Handle boundaries when matching in substrings #675
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
5b157cf
e694693
9024254
6160220
b331bc0
3cd121a
5114ea4
656c388
0f4f005
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,6 +18,7 @@ extension Processor { | |
guard currentPosition < end, let next = input.matchBuiltinCC( | ||
cc, | ||
at: currentPosition, | ||
limitedBy: end, | ||
isInverted: isInverted, | ||
isStrictASCII: isStrictASCII, | ||
isScalarSemantics: isScalarSemantics | ||
|
@@ -119,35 +120,44 @@ extension Processor { | |
|
||
// MARK: Matching `.` | ||
extension String { | ||
// TODO: Should the below have a `limitedBy` parameter? | ||
|
||
func characterAndEnd(at pos: String.Index, limitedBy end: String.Index) -> (Character, String.Index)? { | ||
guard pos < end else { return nil } | ||
let next = index(pos, offsetBy: 1, limitedBy: end) ?? end | ||
return self[pos..<next].first.map { ($0, next) } | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The below seems simpler to understand and is more efficient:
|
||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you run the benchmark suite over this approach vs just There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Post-merge performance tweak. We should hand-outline the rare and slow path, no need to pollute the i-cache and confuse the optimizer. |
||
|
||
func matchAnyNonNewline( | ||
at currentPosition: String.Index, | ||
limitedBy end: String.Index, | ||
isScalarSemantics: Bool | ||
) -> String.Index? { | ||
assert(currentPosition < endIndex) | ||
guard currentPosition < end else { return nil } | ||
if case .definite(let result) = _quickMatchAnyNonNewline( | ||
at: currentPosition, | ||
limitedBy: end, | ||
isScalarSemantics: isScalarSemantics | ||
) { | ||
assert(result == _thoroughMatchAnyNonNewline( | ||
at: currentPosition, | ||
limitedBy: end, | ||
isScalarSemantics: isScalarSemantics)) | ||
return result | ||
} | ||
return _thoroughMatchAnyNonNewline( | ||
at: currentPosition, | ||
limitedBy: end, | ||
isScalarSemantics: isScalarSemantics) | ||
} | ||
|
||
@inline(__always) | ||
private func _quickMatchAnyNonNewline( | ||
at currentPosition: String.Index, | ||
limitedBy end: String.Index, | ||
isScalarSemantics: Bool | ||
) -> QuickResult<String.Index?> { | ||
assert(currentPosition < endIndex) | ||
guard currentPosition < end else { return .definite(nil) } | ||
natecook1000 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
guard let (asciiValue, next, isCRLF) = _quickASCIICharacter( | ||
at: currentPosition | ||
at: currentPosition, limitedBy: end | ||
) else { | ||
return .unknown | ||
} | ||
|
@@ -163,44 +173,47 @@ extension String { | |
@inline(never) | ||
private func _thoroughMatchAnyNonNewline( | ||
at currentPosition: String.Index, | ||
limitedBy end: String.Index, | ||
isScalarSemantics: Bool | ||
) -> String.Index? { | ||
assert(currentPosition < endIndex) | ||
if isScalarSemantics { | ||
guard currentPosition < end else { return nil } | ||
let scalar = unicodeScalars[currentPosition] | ||
guard !scalar.isNewline else { return nil } | ||
return unicodeScalars.index(after: currentPosition) | ||
} | ||
|
||
let char = self[currentPosition] | ||
guard !char.isNewline else { return nil } | ||
return index(after: currentPosition) | ||
guard let (char, next) = characterAndEnd(at: currentPosition, limitedBy: end), | ||
!char.isNewline | ||
else { return nil } | ||
return next | ||
} | ||
} | ||
|
||
// MARK: - Built-in character class matching | ||
extension String { | ||
// TODO: Should the below have a `limitedBy` parameter? | ||
|
||
// Mentioned in ProgrammersManual.md, update docs if redesigned | ||
func matchBuiltinCC( | ||
_ cc: _CharacterClassModel.Representation, | ||
at currentPosition: String.Index, | ||
limitedBy end: String.Index, | ||
isInverted: Bool, | ||
isStrictASCII: Bool, | ||
isScalarSemantics: Bool | ||
) -> String.Index? { | ||
assert(currentPosition < endIndex) | ||
guard currentPosition < end else { return nil } | ||
if case .definite(let result) = _quickMatchBuiltinCC( | ||
cc, | ||
at: currentPosition, | ||
limitedBy: end, | ||
isInverted: isInverted, | ||
isStrictASCII: isStrictASCII, | ||
isScalarSemantics: isScalarSemantics | ||
) { | ||
assert(result == _thoroughMatchBuiltinCC( | ||
cc, | ||
at: currentPosition, | ||
limitedBy: end, | ||
isInverted: isInverted, | ||
isStrictASCII: isStrictASCII, | ||
isScalarSemantics: isScalarSemantics)) | ||
|
@@ -209,6 +222,7 @@ extension String { | |
return _thoroughMatchBuiltinCC( | ||
cc, | ||
at: currentPosition, | ||
limitedBy: end, | ||
isInverted: isInverted, | ||
isStrictASCII: isStrictASCII, | ||
isScalarSemantics: isScalarSemantics) | ||
|
@@ -219,13 +233,17 @@ extension String { | |
private func _quickMatchBuiltinCC( | ||
_ cc: _CharacterClassModel.Representation, | ||
at currentPosition: String.Index, | ||
limitedBy end: String.Index, | ||
isInverted: Bool, | ||
isStrictASCII: Bool, | ||
isScalarSemantics: Bool | ||
) -> QuickResult<String.Index?> { | ||
assert(currentPosition < endIndex) | ||
guard currentPosition < end else { return .definite(nil) } | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (similarly) |
||
guard let (next, result) = _quickMatch( | ||
cc, at: currentPosition, isScalarSemantics: isScalarSemantics | ||
cc, | ||
at: currentPosition, | ||
limitedBy: end, | ||
isScalarSemantics: isScalarSemantics | ||
) else { | ||
return .unknown | ||
} | ||
|
@@ -237,27 +255,25 @@ extension String { | |
private func _thoroughMatchBuiltinCC( | ||
_ cc: _CharacterClassModel.Representation, | ||
at currentPosition: String.Index, | ||
limitedBy end: String.Index, | ||
isInverted: Bool, | ||
isStrictASCII: Bool, | ||
isScalarSemantics: Bool | ||
) -> String.Index? { | ||
assert(currentPosition < endIndex) | ||
let char = self[currentPosition] | ||
// TODO: Branch here on scalar semantics | ||
// Don't want to pay character cost if unnecessary | ||
guard var (char, next) = | ||
characterAndEnd(at: currentPosition, limitedBy: end) | ||
else { return nil } | ||
let scalar = unicodeScalars[currentPosition] | ||
|
||
let asciiCheck = !isStrictASCII | ||
|| (scalar.isASCII && isScalarSemantics) | ||
|| char.isASCII | ||
|
||
var matched: Bool | ||
var next: String.Index | ||
switch (isScalarSemantics, cc) { | ||
case (_, .anyGrapheme): | ||
next = index(after: currentPosition) | ||
case (true, _): | ||
if isScalarSemantics && cc != .anyGrapheme { | ||
next = unicodeScalars.index(after: currentPosition) | ||
case (false, _): | ||
next = index(after: currentPosition) | ||
} | ||
|
||
switch cc { | ||
|
@@ -285,7 +301,7 @@ extension String { | |
if isScalarSemantics { | ||
matched = scalar.isNewline && asciiCheck | ||
if matched && scalar == "\r" | ||
&& next != endIndex && unicodeScalars[next] == "\n" { | ||
&& next != end && unicodeScalars[next] == "\n" { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. BTW, is it There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should probably be |
||
// Match a full CR-LF sequence even in scalar semantics | ||
unicodeScalars.formIndex(after: &next) | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -232,7 +232,7 @@ extension Processor { | |
mutating func match( | ||
_ e: Element, isCaseInsensitive: Bool | ||
) -> Bool { | ||
guard currentPosition < end, let next = input.match( | ||
guard let next = input.match( | ||
e, | ||
at: currentPosition, | ||
limitedBy: end, | ||
|
@@ -251,7 +251,7 @@ extension Processor { | |
_ seq: Substring, | ||
isScalarSemantics: Bool | ||
) -> Bool { | ||
guard currentPosition < end, let next = input.matchSeq( | ||
guard let next = input.matchSeq( | ||
seq, | ||
at: currentPosition, | ||
limitedBy: end, | ||
|
@@ -270,7 +270,7 @@ extension Processor { | |
boundaryCheck: Bool, | ||
isCaseInsensitive: Bool | ||
) -> Bool { | ||
guard currentPosition < end, let next = input.matchScalar( | ||
guard let next = input.matchScalar( | ||
s, | ||
at: currentPosition, | ||
limitedBy: end, | ||
|
@@ -291,7 +291,7 @@ extension Processor { | |
_ bitset: DSLTree.CustomCharacterClass.AsciiBitset, | ||
isScalarSemantics: Bool | ||
) -> Bool { | ||
guard currentPosition < end, let next = input.matchBitset( | ||
guard let next = input.matchBitset( | ||
bitset, | ||
at: currentPosition, | ||
limitedBy: end, | ||
|
@@ -308,8 +308,9 @@ extension Processor { | |
mutating func matchAnyNonNewline( | ||
isScalarSemantics: Bool | ||
) -> Bool { | ||
guard currentPosition < end, let next = input.matchAnyNonNewline( | ||
guard let next = input.matchAnyNonNewline( | ||
at: currentPosition, | ||
limitedBy: end, | ||
isScalarSemantics: isScalarSemantics | ||
) else { | ||
signalFailure() | ||
|
@@ -536,9 +537,9 @@ extension Processor { | |
|
||
case .consumeBy: | ||
let reg = payload.consumer | ||
let consumer = registers[reg] | ||
guard currentPosition < searchBounds.upperBound, | ||
let nextIndex = registers[reg]( | ||
input, currentPosition..<searchBounds.upperBound), | ||
let nextIndex = consumer(input, currentPosition..<searchBounds.upperBound), | ||
nextIndex <= end | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm ok with being extra cautious, especially in the context of 3rd party consumer code. Technically this might mean that they are breaking the contract by not doing their own bounds checking, but that's such an onerous requirement that I think this is better. |
||
else { | ||
signalFailure() | ||
|
@@ -653,20 +654,16 @@ extension String { | |
) -> Index? { | ||
// TODO: This can be greatly sped up with string internals | ||
// TODO: This is also very much quick-check-able | ||
assert(end <= endIndex) | ||
|
||
guard pos < end else { return nil } | ||
guard let (stringChar, next) = characterAndEnd(at: pos, limitedBy: end) | ||
else { return nil } | ||
|
||
if isCaseInsensitive { | ||
guard self[pos].lowercased() == char.lowercased() else { return nil } | ||
guard stringChar.lowercased() == char.lowercased() else { return nil } | ||
} else { | ||
guard self[pos] == char else { return nil } | ||
guard stringChar == char else { return nil } | ||
} | ||
|
||
let idx = index(after: pos) | ||
guard idx <= end else { return nil } | ||
|
||
return idx | ||
return next | ||
} | ||
|
||
func matchSeq( | ||
|
@@ -677,8 +674,6 @@ extension String { | |
) -> Index? { | ||
// TODO: This can be greatly sped up with string internals | ||
// TODO: This is also very much quick-check-able | ||
assert(end <= endIndex) | ||
|
||
var cur = pos | ||
|
||
if isScalarSemantics { | ||
|
@@ -688,8 +683,10 @@ extension String { | |
} | ||
} else { | ||
for e in seq { | ||
guard cur < end, self[cur] == e else { return nil } | ||
self.formIndex(after: &cur) | ||
guard let (char, next) = characterAndEnd(at: cur, limitedBy: end), | ||
char == e | ||
else { return nil } | ||
cur = next | ||
} | ||
} | ||
|
||
|
@@ -706,8 +703,6 @@ extension String { | |
) -> Index? { | ||
// TODO: extremely quick-check-able | ||
// TODO: can be sped up with string internals | ||
assert(end <= endIndex) | ||
|
||
guard pos < end else { return nil } | ||
let curScalar = unicodeScalars[pos] | ||
|
||
|
@@ -720,6 +715,8 @@ extension String { | |
guard curScalar == scalar else { return nil } | ||
} | ||
|
||
// TODO: Is this the correct behavior for a sub-scalar substring end? | ||
// Can a malformed Unicode scalar match anything? | ||
let idx = unicodeScalars.index(after: pos) | ||
guard idx <= end else { return nil } | ||
|
||
|
@@ -738,22 +735,14 @@ extension String { | |
) -> Index? { | ||
// TODO: extremely quick-check-able | ||
// TODO: can be sped up with string internals | ||
assert(end <= endIndex) | ||
|
||
guard pos < end else { return nil } | ||
|
||
let idx: String.Index | ||
if isScalarSemantics { | ||
guard pos < end else { return nil } | ||
guard bitset.matches(unicodeScalars[pos]) else { return nil } | ||
idx = unicodeScalars.index(after: pos) | ||
return unicodeScalars.index(after: pos) | ||
} else { | ||
guard bitset.matches(self[pos]) else { return nil } | ||
idx = index(after: pos) | ||
guard let (char, next) = characterAndEnd(at: pos, limitedBy: end), | ||
bitset.matches(char) else { return nil } | ||
return next | ||
} | ||
|
||
guard idx <= end else { return nil } | ||
return idx | ||
} | ||
|
||
|
||
} |
Uh oh!
There was an error while loading. Please reload this page.