Skip to content

Handle boundaries when matching in substrings #675

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Aug 11, 2023
16 changes: 9 additions & 7 deletions Sources/_StringProcessing/ConsumerInterface.swift
Original file line number Diff line number Diff line change
Expand Up @@ -68,14 +68,16 @@ extension Character {
switch opts.semanticLevel {
case .graphemeCluster:
return { input, bounds in
let low = bounds.lowerBound
guard let (char, next) = input.characterAndEnd(
at: bounds.lowerBound, limitedBy: bounds.upperBound)
else { return nil }
if isCaseInsensitive && isCased {
return input[low].lowercased() == lowercased()
? input.index(after: low)
return char.lowercased() == self.lowercased()
? next
: nil
} else {
return input[low] == self
? input.index(after: low)
return char == self
? next
: nil
}
}
Expand Down Expand Up @@ -188,7 +190,7 @@ extension DSLTree.Atom.CharacterClass {
func generateConsumer(_ opts: MatchingOptions) -> MEProgram.ConsumeFunction {
let model = asRuntimeModel(opts)
return { input, bounds in
model.matches(in: input, at: bounds.lowerBound)
model.matches(in: input, at: bounds.lowerBound, limitedBy: bounds.upperBound)
}
}
}
Expand Down Expand Up @@ -517,7 +519,7 @@ extension DSLTree.CustomCharacterClass {
}
if isInverted {
return opts.semanticLevel == .graphemeCluster
? input.index(after: bounds.lowerBound)
? Swift.min(input.index(after: bounds.lowerBound), bounds.upperBound)
: input.unicodeScalars.index(after: bounds.lowerBound)
}
return nil
Expand Down
64 changes: 40 additions & 24 deletions Sources/_StringProcessing/Engine/MEBuiltins.swift
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ extension Processor {
guard currentPosition < end, let next = input.matchBuiltinCC(
cc,
at: currentPosition,
limitedBy: end,
isInverted: isInverted,
isStrictASCII: isStrictASCII,
isScalarSemantics: isScalarSemantics
Expand Down Expand Up @@ -119,35 +120,44 @@ extension Processor {

// MARK: Matching `.`
extension String {
// TODO: Should the below have a `limitedBy` parameter?

func characterAndEnd(at pos: String.Index, limitedBy end: String.Index) -> (Character, String.Index)? {
guard pos < end else { return nil }
let next = index(pos, offsetBy: 1, limitedBy: end) ?? end
return self[pos..<next].first.map { ($0, next) }
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The below seems simpler to understand and is more efficient:

guard pos < end else { return nil }
let next = index(after: pos)
guard next <= end else { return nil }
return (self[pos], next)

}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you run the benchmark suite over this approach vs just (self[idx], self.index(after: idx)?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Post-merge performance tweak. We should hand-outline the rare and slow path, no need to pollute the i-cache and confuse the optimizer.


func matchAnyNonNewline(
at currentPosition: String.Index,
limitedBy end: String.Index,
isScalarSemantics: Bool
) -> String.Index? {
assert(currentPosition < endIndex)
guard currentPosition < end else { return nil }
if case .definite(let result) = _quickMatchAnyNonNewline(
at: currentPosition,
limitedBy: end,
isScalarSemantics: isScalarSemantics
) {
assert(result == _thoroughMatchAnyNonNewline(
at: currentPosition,
limitedBy: end,
isScalarSemantics: isScalarSemantics))
return result
}
return _thoroughMatchAnyNonNewline(
at: currentPosition,
limitedBy: end,
isScalarSemantics: isScalarSemantics)
}

@inline(__always)
private func _quickMatchAnyNonNewline(
at currentPosition: String.Index,
limitedBy end: String.Index,
isScalarSemantics: Bool
) -> QuickResult<String.Index?> {
assert(currentPosition < endIndex)
guard currentPosition < end else { return .definite(nil) }
guard let (asciiValue, next, isCRLF) = _quickASCIICharacter(
at: currentPosition
at: currentPosition, limitedBy: end
) else {
return .unknown
}
Expand All @@ -163,44 +173,47 @@ extension String {
@inline(never)
private func _thoroughMatchAnyNonNewline(
at currentPosition: String.Index,
limitedBy end: String.Index,
isScalarSemantics: Bool
) -> String.Index? {
assert(currentPosition < endIndex)
if isScalarSemantics {
guard currentPosition < end else { return nil }
let scalar = unicodeScalars[currentPosition]
guard !scalar.isNewline else { return nil }
return unicodeScalars.index(after: currentPosition)
}

let char = self[currentPosition]
guard !char.isNewline else { return nil }
return index(after: currentPosition)
guard let (char, next) = characterAndEnd(at: currentPosition, limitedBy: end),
!char.isNewline
else { return nil }
return next
}
}

// MARK: - Built-in character class matching
extension String {
// TODO: Should the below have a `limitedBy` parameter?

// Mentioned in ProgrammersManual.md, update docs if redesigned
func matchBuiltinCC(
_ cc: _CharacterClassModel.Representation,
at currentPosition: String.Index,
limitedBy end: String.Index,
isInverted: Bool,
isStrictASCII: Bool,
isScalarSemantics: Bool
) -> String.Index? {
assert(currentPosition < endIndex)
guard currentPosition < end else { return nil }
if case .definite(let result) = _quickMatchBuiltinCC(
cc,
at: currentPosition,
limitedBy: end,
isInverted: isInverted,
isStrictASCII: isStrictASCII,
isScalarSemantics: isScalarSemantics
) {
assert(result == _thoroughMatchBuiltinCC(
cc,
at: currentPosition,
limitedBy: end,
isInverted: isInverted,
isStrictASCII: isStrictASCII,
isScalarSemantics: isScalarSemantics))
Expand All @@ -209,6 +222,7 @@ extension String {
return _thoroughMatchBuiltinCC(
cc,
at: currentPosition,
limitedBy: end,
isInverted: isInverted,
isStrictASCII: isStrictASCII,
isScalarSemantics: isScalarSemantics)
Expand All @@ -219,13 +233,17 @@ extension String {
private func _quickMatchBuiltinCC(
_ cc: _CharacterClassModel.Representation,
at currentPosition: String.Index,
limitedBy end: String.Index,
isInverted: Bool,
isStrictASCII: Bool,
isScalarSemantics: Bool
) -> QuickResult<String.Index?> {
assert(currentPosition < endIndex)
guard currentPosition < end else { return .definite(nil) }
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(similarly)

guard let (next, result) = _quickMatch(
cc, at: currentPosition, isScalarSemantics: isScalarSemantics
cc,
at: currentPosition,
limitedBy: end,
isScalarSemantics: isScalarSemantics
) else {
return .unknown
}
Expand All @@ -237,27 +255,25 @@ extension String {
private func _thoroughMatchBuiltinCC(
_ cc: _CharacterClassModel.Representation,
at currentPosition: String.Index,
limitedBy end: String.Index,
isInverted: Bool,
isStrictASCII: Bool,
isScalarSemantics: Bool
) -> String.Index? {
assert(currentPosition < endIndex)
let char = self[currentPosition]
// TODO: Branch here on scalar semantics
// Don't want to pay character cost if unnecessary
guard var (char, next) =
characterAndEnd(at: currentPosition, limitedBy: end)
else { return nil }
let scalar = unicodeScalars[currentPosition]

let asciiCheck = !isStrictASCII
|| (scalar.isASCII && isScalarSemantics)
|| char.isASCII

var matched: Bool
var next: String.Index
switch (isScalarSemantics, cc) {
case (_, .anyGrapheme):
next = index(after: currentPosition)
case (true, _):
if isScalarSemantics && cc != .anyGrapheme {
next = unicodeScalars.index(after: currentPosition)
case (false, _):
next = index(after: currentPosition)
}

switch cc {
Expand Down Expand Up @@ -285,7 +301,7 @@ extension String {
if isScalarSemantics {
matched = scalar.isNewline && asciiCheck
if matched && scalar == "\r"
&& next != endIndex && unicodeScalars[next] == "\n" {
&& next != end && unicodeScalars[next] == "\n" {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW, is it != or <?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should probably be < here

// Match a full CR-LF sequence even in scalar semantics
unicodeScalars.formIndex(after: &next)
}
Expand Down
5 changes: 4 additions & 1 deletion Sources/_StringProcessing/Engine/MEQuantify.swift
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ extension Processor {
return input.matchBuiltinCC(
payload.builtin,
at: currentPosition,
limitedBy: end,
isInverted: payload.builtinIsInverted,
isStrictASCII: payload.builtinIsStrict,
isScalarSemantics: isScalarSemantics)
Expand All @@ -37,7 +38,9 @@ extension Processor {
}

return input.matchAnyNonNewline(
at: currentPosition, isScalarSemantics: isScalarSemantics)
at: currentPosition,
limitedBy: end,
isScalarSemantics: isScalarSemantics)
}
}

Expand Down
59 changes: 24 additions & 35 deletions Sources/_StringProcessing/Engine/Processor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ extension Processor {
mutating func match(
_ e: Element, isCaseInsensitive: Bool
) -> Bool {
guard currentPosition < end, let next = input.match(
guard let next = input.match(
e,
at: currentPosition,
limitedBy: end,
Expand All @@ -251,7 +251,7 @@ extension Processor {
_ seq: Substring,
isScalarSemantics: Bool
) -> Bool {
guard currentPosition < end, let next = input.matchSeq(
guard let next = input.matchSeq(
seq,
at: currentPosition,
limitedBy: end,
Expand All @@ -270,7 +270,7 @@ extension Processor {
boundaryCheck: Bool,
isCaseInsensitive: Bool
) -> Bool {
guard currentPosition < end, let next = input.matchScalar(
guard let next = input.matchScalar(
s,
at: currentPosition,
limitedBy: end,
Expand All @@ -291,7 +291,7 @@ extension Processor {
_ bitset: DSLTree.CustomCharacterClass.AsciiBitset,
isScalarSemantics: Bool
) -> Bool {
guard currentPosition < end, let next = input.matchBitset(
guard let next = input.matchBitset(
bitset,
at: currentPosition,
limitedBy: end,
Expand All @@ -308,8 +308,9 @@ extension Processor {
mutating func matchAnyNonNewline(
isScalarSemantics: Bool
) -> Bool {
guard currentPosition < end, let next = input.matchAnyNonNewline(
guard let next = input.matchAnyNonNewline(
at: currentPosition,
limitedBy: end,
isScalarSemantics: isScalarSemantics
) else {
signalFailure()
Expand Down Expand Up @@ -536,9 +537,9 @@ extension Processor {

case .consumeBy:
let reg = payload.consumer
let consumer = registers[reg]
guard currentPosition < searchBounds.upperBound,
let nextIndex = registers[reg](
input, currentPosition..<searchBounds.upperBound),
let nextIndex = consumer(input, currentPosition..<searchBounds.upperBound),
nextIndex <= end
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm ok with being extra cautious, especially in the context of 3rd party consumer code. Technically this might mean that they are breaking the contract by not doing their own bounds checking, but that's such an onerous requirement that I think this is better.

else {
signalFailure()
Expand Down Expand Up @@ -653,20 +654,16 @@ extension String {
) -> Index? {
// TODO: This can be greatly sped up with string internals
// TODO: This is also very much quick-check-able
assert(end <= endIndex)

guard pos < end else { return nil }
guard let (stringChar, next) = characterAndEnd(at: pos, limitedBy: end)
else { return nil }

if isCaseInsensitive {
guard self[pos].lowercased() == char.lowercased() else { return nil }
guard stringChar.lowercased() == char.lowercased() else { return nil }
} else {
guard self[pos] == char else { return nil }
guard stringChar == char else { return nil }
}

let idx = index(after: pos)
guard idx <= end else { return nil }

return idx
return next
}

func matchSeq(
Expand All @@ -677,8 +674,6 @@ extension String {
) -> Index? {
// TODO: This can be greatly sped up with string internals
// TODO: This is also very much quick-check-able
assert(end <= endIndex)

var cur = pos

if isScalarSemantics {
Expand All @@ -688,8 +683,10 @@ extension String {
}
} else {
for e in seq {
guard cur < end, self[cur] == e else { return nil }
self.formIndex(after: &cur)
guard let (char, next) = characterAndEnd(at: cur, limitedBy: end),
char == e
else { return nil }
cur = next
}
}

Expand All @@ -706,8 +703,6 @@ extension String {
) -> Index? {
// TODO: extremely quick-check-able
// TODO: can be sped up with string internals
assert(end <= endIndex)

guard pos < end else { return nil }
let curScalar = unicodeScalars[pos]

Expand All @@ -720,6 +715,8 @@ extension String {
guard curScalar == scalar else { return nil }
}

// TODO: Is this the correct behavior for a sub-scalar substring end?
// Can a malformed Unicode scalar match anything?
let idx = unicodeScalars.index(after: pos)
guard idx <= end else { return nil }

Expand All @@ -738,22 +735,14 @@ extension String {
) -> Index? {
// TODO: extremely quick-check-able
// TODO: can be sped up with string internals
assert(end <= endIndex)

guard pos < end else { return nil }

let idx: String.Index
if isScalarSemantics {
guard pos < end else { return nil }
guard bitset.matches(unicodeScalars[pos]) else { return nil }
idx = unicodeScalars.index(after: pos)
return unicodeScalars.index(after: pos)
} else {
guard bitset.matches(self[pos]) else { return nil }
idx = index(after: pos)
guard let (char, next) = characterAndEnd(at: pos, limitedBy: end),
bitset.matches(char) else { return nil }
return next
}

guard idx <= end else { return nil }
return idx
}


}
Loading