Skip to content

Commit

Permalink
Add String multi-replace via Scanner ⏩ (#227)
Browse files Browse the repository at this point in the history
Sometimes we need to replace multiple characters in a given `String`,
but both `replacingOccurrences()` and `replacingCharacters` operate on
a single `String`/`Character`, requiring multiple passes and thus
becoming very inefficient. One such example is replacing all line
breaking characters in a string with their non line breaking versions,
which requires 6 substitution "passes" (space, hyphen, em dash, en
dash, question mark and closing brace).

By using a `Scanner` as a matching mechanism, we can implement
multi-replace in a single pass on the string, greatly improving
efficiency.

## Changes

- Add new
`String.replacingOccurrencesOfCharacters(in:skippingCharactersIn:)`
extension to allow replacing multiple characters in a string in a
single pass.

- Add new `String.nonLineBreaking(replacingNewlinesWith:)`
extension to convert a string into a non line breaking version and
allow tweaking the newline replacement behavior.

- Create new `Character.newlines` helper to contain all `Character`s in
`CharacterSet.newlines`.
  • Loading branch information
p4checo authored Apr 8, 2021
1 parent 45bdc59 commit 408a301
Show file tree
Hide file tree
Showing 4 changed files with 263 additions and 1 deletion.
4 changes: 4 additions & 0 deletions Alicerce.xcodeproj/project.pbxproj
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@
0A77982920FCCD24008E269A /* RetryTestCase.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0A77982820FCCD24008E269A /* RetryTestCase.swift */; };
0A77982F20FFF29D008E269A /* Retry.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0A77982E20FFF29D008E269A /* Retry.swift */; };
0A79686120812130005738AF /* LockTestCase.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0ACEB2992080F0E5000D95AD /* LockTestCase.swift */; };
0A7ACC852527467B00AA2213 /* Character.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0A7ACC842527467B00AA2213 /* Character.swift */; };
0A7B504D20B632FA005A08E7 /* *.alicerce.mindera.com.pem in Resources */ = {isa = PBXBuildFile; fileRef = 0A7B504C20B632FA005A08E7 /* *.alicerce.mindera.com.pem */; };
0A7B505020B6D346005A08E7 /* SecCertificate+PublicKey.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0A7B504E20B6D2C4005A08E7 /* SecCertificate+PublicKey.swift */; };
0A7B505220B6D769005A08E7 /* SecCertificate+PublicKeyTestCase.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0A7B505120B6D769005A08E7 /* SecCertificate+PublicKeyTestCase.swift */; };
Expand Down Expand Up @@ -491,6 +492,7 @@
0A76A004209F854C00D46B63 /* Route+TrieNode_IsEmptyAndDescriptionTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = "Route+TrieNode_IsEmptyAndDescriptionTests.swift"; sourceTree = "<group>"; };
0A77982820FCCD24008E269A /* RetryTestCase.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = RetryTestCase.swift; sourceTree = "<group>"; };
0A77982E20FFF29D008E269A /* Retry.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Retry.swift; sourceTree = "<group>"; };
0A7ACC842527467B00AA2213 /* Character.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Character.swift; sourceTree = "<group>"; };
0A7B504C20B632FA005A08E7 /* *.alicerce.mindera.com.pem */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = "*.alicerce.mindera.com.pem"; sourceTree = "<group>"; };
0A7B504E20B6D2C4005A08E7 /* SecCertificate+PublicKey.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = "SecCertificate+PublicKey.swift"; sourceTree = "<group>"; };
0A7B505120B6D769005A08E7 /* SecCertificate+PublicKeyTestCase.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = "SecCertificate+PublicKeyTestCase.swift"; sourceTree = "<group>"; };
Expand Down Expand Up @@ -806,6 +808,7 @@
0A3C2C8E1EA7E18500EFB7D4 /* String.swift */,
0A3C2C8F1EA7E18500EFB7D4 /* Thread.swift */,
1B4D4CB61F05016B00FA4260 /* URLRequest.swift */,
0A7ACC842527467B00AA2213 /* Character.swift */,
);
path = Foundation;
sourceTree = "<group>";
Expand Down Expand Up @@ -2011,6 +2014,7 @@
0A3C2DB71EA7E5DD00EFB7D4 /* CollectionReusableView.swift in Sources */,
9D4E3AA1239A6557007F3050 /* CollectionReusableViewSizer.swift in Sources */,
4838FE3123A94CE0007311F0 /* Array+ConstrainableProxy.swift in Sources */,
0A7ACC852527467B00AA2213 /* Character.swift in Sources */,
0A266F201ED374F5009CD0D7 /* AssertDumpsEqual.swift in Sources */,
0ACEB2922080E6D4000D95AD /* Atomic.swift in Sources */,
0A83885E1EB1F6B000C1E835 /* NSPersistentStoreCoordinator+CoreDataStack.swift in Sources */,
Expand Down
14 changes: 14 additions & 0 deletions Sources/Extensions/Foundation/Character.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import Foundation

extension Character {

    /// Line separator (`U+2028`).
    static let lineSeparator: Character = "\u{2028}"
    /// No-break space (`U+00A0`).
    static let nonBreakingSpace: Character = "\u{00a0}"
    /// Non-breaking hyphen (`U+2011`).
    static let nonBreakingHyphen: Character = "\u{2011}"
    /// Word joiner (`U+2060`).
    static let wordJoiner: Character = "\u{2060}"
    /// Em dash (`U+2014`). Note that `U+2013` is the *en* dash, not the em dash.
    static let emDash: Character = "\u{2014}" // —
    /// En dash (`U+2013`).
    static let enDash: Character = "\u{2013}" // –

    /// The characters in `CharacterSet.newlines` (`U+000A ~ U+000D`, `U+0085`, `U+2028` and `U+2029`).
    static let newlines: [Character] = ["\u{A}", "\u{B}", "\u{C}", "\u{D}", "\u{85}", "\u{2028}", "\u{2029}"]
}
97 changes: 97 additions & 0 deletions Sources/Extensions/Foundation/String.swift
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,100 @@ public extension String {
dump(x, to: &self)
}
}

extension String {

    /// Replaces occurrences of multiple `Character`s with corresponding `String` values using the given mapping, while
    /// skipping (filtering out) an optional set of characters from the output. Being backed by a `Scanner`, a single
    /// pass is made over the receiver.
    ///
    /// Matching is case sensitive: a key only replaces occurrences of exactly that character, so a map may contain
    /// both `"a"` and `"A"` with different replacements.
    ///
    /// Keys composed of multiple Unicode scalars (e.g. `"e\u{301}"` or `"\r\n"`) are only replaced when the whole
    /// character occurs in the receiver. Input that merely contains one of the key's scalars is copied to the output
    /// unchanged.
    ///
    /// - Parameters:
    ///   - replacementMap: A dictionary containing the replacement mapping `Character` -> `String`. If empty, the
    ///   receiver is returned unchanged (and `charactersToBeSkipped` is not applied).
    ///   - charactersToBeSkipped: An optional set of characters to skip (i.e. filter out from the input).
    /// - Returns: A modified version of the receiver with the replacement mapping applied.
    public func replacingOccurrencesOfCharacters(
        in replacementMap: [Character: String],
        skippingCharactersIn charactersToBeSkipped: CharacterSet? = nil
    ) -> String {

        guard !replacementMap.isEmpty else { return self }

        // `CharacterSet` works on Unicode scalars, so every scalar of every key ends up in the set
        let matchSet = CharacterSet(charactersIn: String(replacementMap.keys))
            .union(charactersToBeSkipped ?? CharacterSet())

        // UTF-16 view of the receiver, matching the units used by `Scanner.scanLocation`
        let source = self as NSString

        var final = ""

        let scanner = Scanner(string: self)
        scanner.charactersToBeSkipped = charactersToBeSkipped
        // `Scanner` ignores case by default, which would let the key "a" consume an "A" from the input
        scanner.caseSensitive = true

        while !scanner.isAtEnd {

            // copy everything until finding a character to be replaced or skipped
            var collector: NSString?
            let didCopyPrefix = scanner.scanUpToCharacters(from: matchSet, into: &collector)
            if didCopyPrefix, let collector = collector {
                final.append(collector as String)
            }

            // exit early if we're already at the end
            guard !scanner.isAtEnd else { break }

            // find and replace matching character if needed
            if let hit = replacementMap.first(where: { match, _ in scanner.scanString(String(match), into: nil) }) {
                final.append(hit.value)
                continue
            }

            // a prefix was consumed, so the scanner moved forward and the next iteration can carry on
            guard !didCopyPrefix else { continue }

            // Nothing was copied and nothing was replaced: the scanner sits on a scalar that belongs to a
            // multi-scalar key without the full key being present. Without intervention the loop would never
            // advance, so copy the offending character through verbatim.
            if let skipSet = charactersToBeSkipped {
                // failed scans may leave skippable characters unconsumed; drop them explicitly first
                scanner.charactersToBeSkipped = nil
                _ = scanner.scanCharacters(from: skipSet, into: nil)
                scanner.charactersToBeSkipped = skipSet
            }

            let location = scanner.scanLocation
            guard location < source.length else { break }

            // copy from the current location up to the end of the enclosing composed character sequence (the
            // part of the sequence before `location`, if any, has already been copied)
            let end = NSMaxRange(source.rangeOfComposedCharacterSequence(at: location))
            final.append(source.substring(with: NSRange(location: location, length: end - location)))
            scanner.scanLocation = end
        }

        return final
    }
}

extension String {

    /// Single-character string containing a no-break space (`U+00A0`).
    public static let nonBreakingSpace: String = String(Character.nonBreakingSpace)
    /// Single-character string containing a non-breaking hyphen (`U+2011`).
    public static let nonBreakingHyphen: String = String(Character.nonBreakingHyphen)
    /// Single-character string containing a word joiner (`U+2060`).
    public static let wordJoiner: String = String(Character.wordJoiner)
    /// Single-character string containing `Character.emDash`.
    public static let emDash: String = String(Character.emDash)
    /// Single-character string containing `Character.enDash`.
    public static let enDash: String = String(Character.enDash)

    /// Produces a copy of the receiver in which line breaking opportunities have been removed.
    ///
    /// Characters which have a dedicated non breaking counterpart are swapped for it. The remaining ones are kept but
    /// get word joiner characters attached, which prevents a break around them. Newlines are left alone unless a
    /// replacement string is supplied.
    ///
    /// Applied substitutions:
    /// - space (" ") -> non breaking space (`U+00A0`)
    /// - hyphen ("-") -> non breaking hyphen (`U+2011`)
    /// - em dash -> word joiner (`U+2060`) + em dash + word joiner (`U+2060`)
    /// - en dash -> word joiner (`U+2060`) + en dash + word joiner (`U+2060`)
    /// - question mark ("?") -> question mark + word joiner (`U+2060`)
    /// - closing brace ("}") -> closing brace + word joiner (`U+2060`)
    ///
    /// When `newlineCharacterReplacement` is non-`nil`, every character listed in `Character.newlines`
    /// (`U+000A ~ U+000D`, `U+0085`, `U+2028`, and `U+2029`) is substituted by it. Some example values:
    /// - `nil` -> newlines are preserved
    /// - `""` -> newlines are stripped
    /// - `String.nonBreakingSpace` -> output a single line
    ///
    /// - Parameter newlineCharacterReplacement: The string substituted for each newline character, or `nil` (the
    /// default) to keep newlines untouched.
    /// - Returns: A modified version of the receiver without line breaking characters.
    public func nonLineBreaking(replacingNewlinesWith newlineCharacterReplacement: String? = nil) -> String {

        let joiner = Character.wordJoiner

        var substitutions: [Character: String] = [
            " ": String.nonBreakingSpace,
            "-": String.nonBreakingHyphen,
            Character.emDash: String([joiner, Character.emDash, joiner]),
            Character.enDash: String([joiner, Character.enDash, joiner]),
            "?": "?" + String.wordJoiner,
            "}": "}" + String.wordJoiner
        ]

        // newline entries are only registered when a replacement was requested
        if let newlineSubstitute = newlineCharacterReplacement {
            for newline in Character.newlines {
                substitutions[newline] = newlineSubstitute
            }
        }

        return replacingOccurrencesOfCharacters(in: substitutions, skippingCharactersIn: nil)
    }
}
149 changes: 148 additions & 1 deletion Tests/AlicerceTests/Extensions/Foundation/StringTestCase.swift
Original file line number Diff line number Diff line change
Expand Up @@ -66,5 +66,152 @@ class StringTestCase: XCTestCase {

XCTAssertEqual(intDump, dumpString)
}


// replacingOccurrencesOfCharacters(in:skippingCharactersIn:)

/// An empty replacement map is expected to yield the receiver unchanged.
func testReplacingOccurrencesOfCharacters_WithEmptyMap_ShouldReturnSelf() {

    let pangram = "The quick brown fox jumps over the lazy dog"
    let noReplacements: [Character: String] = [:]

    let result = pangram.replacingOccurrencesOfCharacters(in: noReplacements, skippingCharactersIn: nil)

    XCTAssertEqual(result, pangram)
}

/// A single-entry map is expected to replace every occurrence of its key (here: each space becomes an underscore).
func testReplacingOccurrencesOfCharacters_WithMatchingCharactersInSingleEntryMapAndNilSkippingCharacterSet_ShouldReplaceOccurrences() {

    let spaceToUnderscore: [Character: String] = [" ": "_"]

    let result = "The quick brown fox jumps over the lazy dog"
        .replacingOccurrencesOfCharacters(in: spaceToUnderscore, skippingCharactersIn: nil)

    XCTAssertEqual(result, "The_quick_brown_fox_jumps_over_the_lazy_dog")
}

/// A multi-entry map is expected to apply every mapping, leaving unmapped characters (the digits) untouched.
func testReplacingOccurrencesOfCharacters_WithMatchingCharactersInMultiEntryMapAndNilSkippingCharacterSet_ShouldReplaceOccurrences() {

    let lowercasingHexLetters: [Character: String] = [
        "A": "a",
        "B": "b",
        "C": "c",
        "D": "d",
        "E": "e",
        "F": "f",
    ]

    let result = "0123456789ABCDEF"
        .replacingOccurrencesOfCharacters(in: lowercasingHexLetters, skippingCharactersIn: nil)

    XCTAssertEqual(result, "0123456789abcdef")
}

/// With a skipping set supplied, mapped characters are expected to be replaced while characters in the set (the
/// decimal digits) are dropped from the output, wherever they appear.
func testReplacingOccurrencesOfCharacters_WithMatchingCharactersInMapAndMatchingCharactersInSkippingCharacterSet_ShouldReplaceOccurrencesAndSkip() {

    let lowercasingHexLetters: [Character: String] = [
        "A": "a",
        "B": "b",
        "C": "c",
        "D": "d",
        "E": "e",
        "F": "f",
    ]

    let result = "0123456789ABCDEF_0A0B0C0D0E0F0"
        .replacingOccurrencesOfCharacters(in: lowercasingHexLetters, skippingCharactersIn: .decimalDigits)

    XCTAssertEqual(result, "abcdef_abcdef")
}

// nonLineBreaking()

/// A string containing none of the handled line breaking characters is expected to come back unchanged.
func testNonLineBreaking_WithNoLineBreakingCharactersInString_ShouldReturnSelf() {

    let plain = "0123456789ABCDEF"

    let converted = plain.nonLineBreaking()

    XCTAssertEqual(converted, plain)
}

// Exercises every substitution performed by `nonLineBreaking()` (space, hyphen, em dash, en dash, question mark and
// closing brace) on a string without newlines, using the default (`nil`) newline replacement.
func testNonLineBreaking_WithLineBreakingCharactersInString_ShouldReturnANonLineBreakingVersion() {

let original = "The quick-brown\(String.emDash)fox\(String.enDash)jumps?over{the}lazy dog"
// The trailing backslashes join the lines below into a single line. Note that the opening brace is expected to be
// left untouched: only the closing brace gets a word joiner appended.
let expected =
"""
The\(String.nonBreakingSpace)quick\(String.nonBreakingHyphen)brown\
\(String([.wordJoiner, .emDash, .wordJoiner]))fox\
\(String([.wordJoiner, .enDash, .wordJoiner]))jumps\
?\(String.wordJoiner)over{the}\(String.wordJoiner)lazy\(String.nonBreakingSpace)dog
"""

XCTAssertEqual(original.nonLineBreaking(), expected)
}

// With an explicit `nil` newline replacement, the line breaking characters are expected to be substituted while
// every newline character in the input is expected to be carried over unchanged.
func testNonLineBreaking_WithLineBreakingCharactersAndNewlinesInStringAndNilNewlineReplacement_ShouldReturnANonLineBreakingVersionAndPreserveNewlines() {

// Input sprinkled with newline characters, written as escapes (`\n`, U+0085, U+2028, U+2029, U+000A ~ U+000D). The
// trailing backslash joins the two source lines, so the only newlines are the escaped ones.
let original =
"""
\nThe quick-brown\u{85}\(String.emDash)fox\n\(String.enDash)jumps?\u{2028}\u{2029}over{the}lazy dog\n\
\u{A}.\u{B},\u{C};\u{D}
"""

// Same newline escapes as the input, in the same positions.
let expected =
"""
\nThe\(String.nonBreakingSpace)quick\(String.nonBreakingHyphen)brown\u{85}\
\(String([.wordJoiner, .emDash, .wordJoiner]))fox\n\
\(String([.wordJoiner, .enDash, .wordJoiner]))jumps\
?\(String.wordJoiner)\u{2028}\u{2029}over\
{the}\(String.wordJoiner)lazy\(String.nonBreakingSpace)dog\n\
\u{A}.\u{B},\u{C};\u{D}
"""

XCTAssertEqual(original.nonLineBreaking(replacingNewlinesWith: nil), expected)
}

// With an empty string as newline replacement, the line breaking characters are expected to be substituted and
// every newline character in the input is expected to be stripped from the output.
func testNonLineBreaking_WithLineBreakingCharactersAndNewlinesInStringAndEmptyStringNewlineReplacement_ShouldReturnANonLineBreakingVersionAndReplaceNewlines() {

// Input sprinkled with newline characters, written as escapes (`\n`, U+0085, U+2028, U+2029, U+000A ~ U+000D). The
// trailing backslash joins the two source lines, so the only newlines are the escaped ones.
let original =
"""
\nThe quick-brown\u{85}\(String.emDash)fox\n\(String.enDash)jumps?\u{2028}\u{2029}over{the}lazy dog\n\
\u{A}.\u{B},\u{C};\u{D}
"""

// No newline escapes remain; the trailing backslashes join everything into a single line.
let expected =
"""
The\(String.nonBreakingSpace)quick\(String.nonBreakingHyphen)brown\
\(String([.wordJoiner, .emDash, .wordJoiner]))fox\
\(String([.wordJoiner, .enDash, .wordJoiner]))jumps\
?\(String.wordJoiner)over\
{the}\(String.wordJoiner)lazy\(String.nonBreakingSpace)dog\
.,;
"""

XCTAssertEqual(original.nonLineBreaking(replacingNewlinesWith: ""), expected)
}

// With a non-empty newline replacement, the line breaking characters are expected to be substituted and each
// newline character in the input is expected to be replaced by one copy of the replacement string.
func testNonLineBreaking_WithLineBreakingCharactersAndNewlinesInStringAndNonNilStringNewlineReplacement_ShouldReturnANonLineBreakingVersionAndReplaceNewlines() {

// Input sprinkled with newline characters, written as escapes (`\n`, U+0085, U+2028, U+2029, U+000A ~ U+000D). The
// trailing backslash joins the two source lines, so the only newlines are the escaped ones.
let original =
"""
\nThe quick-brown\u{85}\(String.emDash)fox\n\(String.enDash)jumps?\u{2028}\u{2029}over{the}lazy dog\n\
\u{A}.\u{B},\u{C};\u{D}
"""

// One replacement per newline character: adjacent newlines in the input yield adjacent replacements.
let expected =
"""
🦊The\(String.nonBreakingSpace)quick\(String.nonBreakingHyphen)brown🦊\
\(String([.wordJoiner, .emDash, .wordJoiner]))fox🦊\
\(String([.wordJoiner, .enDash, .wordJoiner]))jumps\
?\(String.wordJoiner)🦊🦊over\
{the}\(String.wordJoiner)lazy\(String.nonBreakingSpace)dog🦊\
🦊.🦊,🦊;🦊
"""

XCTAssertEqual(original.nonLineBreaking(replacingNewlinesWith: "🦊"), expected)
}
}

0 comments on commit 408a301

Please sign in to comment.