From eee2721794d1646ba6fbdd7151c839ff7eb7c0ab Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Sun, 10 Dec 2023 11:26:44 -0700 Subject: [PATCH] Add ASCII fast-path ASCII character class matching Uses quickASCIICharacter to speed up ASCII character class matching. 2x speedup for EmailLookahead_All and many, many others. 10% regression in AnchoredNotFound_First and related. --- .../Engine/InstPayload.swift | 4 +- .../_StringProcessing/Engine/MEQuantify.swift | 4 +- .../_StringProcessing/Engine/Processor.swift | 55 +++++++++++++++---- .../Utility/AsciiBitset.swift | 12 +++- 4 files changed, 56 insertions(+), 19 deletions(-) diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index d569fcd32..78baf9ce1 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -378,7 +378,7 @@ extension Instruction.Payload { struct QuantifyPayload: RawRepresentable { let rawValue: UInt64 enum PayloadType: UInt64 { - case bitset = 0 + case asciiBitset = 0 case asciiChar = 1 case any = 2 case builtin = 4 @@ -448,7 +448,7 @@ struct QuantifyPayload: RawRepresentable { ) { assert(bitset.bits <= _payloadMask) self.rawValue = bitset.bits - + QuantifyPayload.packInfoValues(kind, minTrips, maxExtraTrips, .bitset, isScalarSemantics: isScalarSemantics) + + QuantifyPayload.packInfoValues(kind, minTrips, maxExtraTrips, .asciiBitset, isScalarSemantics: isScalarSemantics) } init( diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 2d187607c..a0480cde6 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -3,8 +3,8 @@ extension Processor { let isScalarSemantics = payload.isScalarSemantics switch payload.type { - case .bitset: - return input.matchBitset( + case .asciiBitset: + return input.matchASCIIBitset( registers[payload.bitset], 
at: currentPosition, limitedBy: end, diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 6e0a7774c..86365322b 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -291,7 +291,7 @@ extension Processor { _ bitset: DSLTree.CustomCharacterClass.AsciiBitset, isScalarSemantics: Bool ) -> Bool { - guard let next = input.matchBitset( + guard let next = input.matchASCIIBitset( bitset, at: currentPosition, limitedBy: end, @@ -723,22 +723,53 @@ extension String { return idx } - func matchBitset( + func matchASCIIBitset( _ bitset: DSLTree.CustomCharacterClass.AsciiBitset, at pos: Index, limitedBy end: Index, isScalarSemantics: Bool ) -> Index? { - // TODO: extremely quick-check-able - // TODO: can be sped up with string internals - if isScalarSemantics { - guard pos < end else { return nil } - guard bitset.matches(unicodeScalars[pos]) else { return nil } - return unicodeScalars.index(after: pos) - } else { - guard let (char, next) = characterAndEnd(at: pos, limitedBy: end), - bitset.matches(char) else { return nil } - return next + + // FIXME: Inversion should be tracked and handled in only one place. + // That is, we should probably store it as a bit in the instruction, so that + // bitset matching and bitset inversion is bit-based rather than semantically + // inverting the notion of a match or not. As-is, we need to track both + // meanings in some code paths. 
+ let isInverted = bitset.isInverted + + // TODO: More fodder for refactoring `_quickASCIICharacter`, see the comment + // there + guard let (asciiByte, next, isCRLF) = _quickASCIICharacter( + at: pos, + limitedBy: end + ) else { + if isScalarSemantics { + guard pos < end else { return nil } + guard bitset.matches(unicodeScalars[pos]) else { return nil } + return unicodeScalars.index(after: pos) + } else { + guard let (char, next) = characterAndEnd(at: pos, limitedBy: end), + bitset.matches(char) else { return nil } + return next + } + } + + guard bitset.matches(asciiByte) else { + // FIXME: check inversion here after refactored out of bitset + return nil } + + // CR-LF should only match `[\r]` in scalar semantic mode or if inverted + if isCRLF { + if isScalarSemantics { + return self.unicodeScalars.index(before: next) + } + if isInverted { + return next + } + return nil + } + + return next } } diff --git a/Sources/_StringProcessing/Utility/AsciiBitset.swift b/Sources/_StringProcessing/Utility/AsciiBitset.swift index e063447a0..2b0217cdc 100644 --- a/Sources/_StringProcessing/Utility/AsciiBitset.swift +++ b/Sources/_StringProcessing/Utility/AsciiBitset.swift @@ -1,3 +1,4 @@ +// TODO: Probably refactor out of DSLTree extension DSLTree.CustomCharacterClass { internal struct AsciiBitset { let isInverted: Bool @@ -49,7 +50,7 @@ extension DSLTree.CustomCharacterClass { } } - private func matches(_ val: UInt8) -> Bool { + private func _matchesWithoutInversionCheck(_ val: UInt8) -> Bool { if val < 64 { return (a >> val) & 1 == 1 } else { @@ -57,10 +58,15 @@ extension DSLTree.CustomCharacterClass { } } + internal func matches(_ byte: UInt8) -> Bool { + guard byte < 128 else { return isInverted } + return _matchesWithoutInversionCheck(byte) == !isInverted + } + internal func matches(_ char: Character) -> Bool { let matched: Bool if let val = char._singleScalarAsciiValue { - matched = matches(val) + matched = _matchesWithoutInversionCheck(val) } else { matched = false } @@ 
-75,7 +81,7 @@ extension DSLTree.CustomCharacterClass { let matched: Bool if scalar.isASCII { let val = UInt8(ascii: scalar) - matched = matches(val) + matched = _matchesWithoutInversionCheck(val) } else { matched = false }