Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit e27cf40

Browse files
committed Mar 14, 2025 ·
Handle unprocessable whitespace-related unicode characters
1 parent 2cb34ec commit e27cf40

File tree

3 files changed

+109
-12
lines changed

3 files changed

+109
-12
lines changed
 

‎Sources/SwiftFormat/PrettyPrint/WhitespaceFindingCategory.swift

+4
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@ enum WhitespaceFindingCategory: FindingCategorizing {
3333
/// Findings related to the length of a line.
3434
case lineLength
3535

36+
/// Findings related to the presence of disallowed or unexpected Unicode whitespace characters.
37+
case unexpectedUnicode
38+
3639
var description: String {
3740
switch self {
3841
case .trailingWhitespace: return "TrailingWhitespace"
@@ -42,6 +45,7 @@ enum WhitespaceFindingCategory: FindingCategorizing {
4245
case .removeLine: return "RemoveLine"
4346
case .addLines: return "AddLines"
4447
case .lineLength: return "LineLength"
48+
case .unexpectedUnicode: return "UnexpectedUnicode"
4549
}
4650
}
4751
}

‎Sources/SwiftFormat/PrettyPrint/WhitespaceLinter.swift

+80-12
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,15 @@ public class WhitespaceLinter {
113113
let userRun = userRunsIterator.next()!
114114
let formattedRun = formattedRunsIterator.next()!
115115

116+
// Print a diagnostic for unexpected Unicode characters with the highest priority.
117+
let unicodeExceptionErrors = checkForUnicodeExceptionErrors(userIndex: userIndex, userRun: userRun)
118+
guard unicodeExceptionErrors.isEmpty else {
119+
unicodeExceptionErrors.forEach { exception, offset in
120+
diagnose(.removeUnexpectedUnicode(exception), category: .unexpectedUnicode, utf8Offset: offset)
121+
}
122+
return
123+
}
124+
116125
// If there was only a single whitespace run in each input, then that means there weren't any
117126
// newlines. Therefore, we're looking at inter-token spacing, unless the whitespace runs
118127
// preceded the first token in the file (i.e., offset == 0), in which case we ignore it here
@@ -126,7 +135,13 @@ public class WhitespaceLinter {
126135

127136
while let userRun = userRunsIterator.next() {
128137
let possibleFormattedRun = formattedRunsIterator.next()
129-
138+
let unicodeExceptionErrors = checkForUnicodeExceptionErrors(userIndex: userIndex, userRun: userRun)
139+
guard unicodeExceptionErrors.isEmpty else {
140+
unicodeExceptionErrors.forEach { exception, offset in
141+
diagnose(.removeUnexpectedUnicode(exception), category: .unexpectedUnicode, utf8Offset: offset)
142+
}
143+
continue
144+
}
130145
if runIndex < excessUserLines {
131146
// If there were excess newlines in the user input, tell the user to remove them. This
132147
// short-circuits the trailing whitespace check below; we don't bother telling the user
@@ -323,6 +338,33 @@ public class WhitespaceLinter {
323338
}
324339
}
325340

341+
/// Scans a whitespace run for every `UnicodeException` byte sequence it contains.
///
/// The run is walked front to back. Whenever the unscanned remainder begins with the UTF-8
/// encoding of one of the `UnicodeException` cases, that case is recorded and its whole encoding
/// is skipped; otherwise the scan advances by a single code unit.
///
/// - Parameters:
///   - userIndex: The current character offset within the user text.
///   - userRun: A run of whitespace from the user text.
/// - Returns: One entry per match, in order of appearance, pairing the matched `UnicodeException`
///   with `userIndex` plus the number of code units that precede the match inside `userRun`.
private func checkForUnicodeExceptionErrors(
  userIndex: Int,
  userRun: ArraySlice<UTF8.CodeUnit>
) -> [(exception: UnicodeException, offset: Int)] {
  var found: [(exception: UnicodeException, offset: Int)] = []
  // The part of the run that has not been examined yet; it shrinks from the front.
  var remaining = userRun

  while !remaining.isEmpty {
    // Number of code units already consumed from the front of `userRun`.
    let consumed = userRun.count - remaining.count
    let hit = UnicodeException.allCases.first { candidate in
      remaining.starts(with: candidate.utf8Bytes)
    }

    guard let exception = hit else {
      // No exception begins here; move on by one code unit.
      remaining = remaining.dropFirst()
      continue
    }

    found.append((exception: exception, offset: userIndex + consumed))
    remaining = remaining.dropFirst(exception.utf8Bytes.count)
  }

  return found
}
367+
326368
/// Find the next non-whitespace character in a given string, and any leading whitespace before
327369
/// the character.
328370
///
@@ -339,20 +381,26 @@ public class WhitespaceLinter {
339381
startingAt offset: Int,
340382
in data: [UTF8.CodeUnit]
341383
) -> ArraySlice<UTF8.CodeUnit> {
342-
func isWhitespace(_ char: UTF8.CodeUnit) -> Bool {
343-
switch char {
344-
case UInt8(ascii: " "), UInt8(ascii: "\n"), UInt8(ascii: "\t"), UInt8(ascii: "\r"), /*VT*/ 0x0B, /*FF*/ 0x0C:
345-
return true
384+
var currentIndex = offset
385+
while currentIndex < data.count {
386+
if let unicodeException = UnicodeException.allCases.first(where: { exception in
387+
let bytes = exception.utf8Bytes
388+
return currentIndex + bytes.count <= data.count
389+
&& data[currentIndex..<currentIndex + bytes.count].elementsEqual(bytes)
390+
}) {
391+
currentIndex += unicodeException.utf8Bytes.count
392+
continue
393+
}
394+
395+
switch data[currentIndex] {
396+
case UInt8(ascii: " "), UInt8(ascii: "\n"), UInt8(ascii: "\t"), UInt8(ascii: "\r"),
397+
/*VT*/ 0x0B, /*FF*/ 0x0C:
398+
currentIndex += 1
346399
default:
347-
return false
400+
return data[offset..<currentIndex]
348401
}
349402
}
350-
guard
351-
let whitespaceEnd = data[offset...].firstIndex(where: { !isWhitespace($0) })
352-
else {
353-
return data[offset..<data.endIndex]
354-
}
355-
return data[offset..<whitespaceEnd]
403+
return data[offset..<currentIndex]
356404
}
357405

358406
/// Returns the code unit at the given index, or nil if the index is the end of the data.
@@ -412,6 +460,22 @@ public class WhitespaceLinter {
412460
}
413461
}
414462

463+
/// The Unicode characters that are reported as unexpected rather than processed normally.
private enum UnicodeException: CaseIterable {
  /// U+2028 LINE SEPARATOR.
  case u2028
  /// U+2029 PARAGRAPH SEPARATOR.
  case u2029

  /// The UTF-8 code units that encode this character.
  var utf8Bytes: [UTF8.CodeUnit] {
    // Both characters encode to three bytes that share the leading 0xE2 0x80 and differ only in
    // the last byte.
    let finalByte: UTF8.CodeUnit
    switch self {
    case .u2028: finalByte = 0xA8
    case .u2029: finalByte = 0xA9
    }
    return [0xE2, 0x80, finalByte]
  }
}
478+
415479
/// Describes the composition of the whitespace that creates an indentation for a line of code.
416480
public enum WhitespaceIndentation: Equatable {
417481
/// The line has no preceding whitespace, meaning there's no indentation.
@@ -513,4 +577,8 @@ extension Finding.Message {
513577
}
514578

515579
fileprivate static let lineLengthError: Finding.Message = "line is too long"
580+
581+
/// Builds the finding message that asks the user to remove an unexpected Unicode character.
///
/// The text ends with a literal backslash followed by the interpolated `unicode` value, so the
/// `.u2028` case reads `remove unexpected unicode character \u2028`.
///
/// - Parameter unicode: The unexpected character that was found.
/// - Returns: The message to attach to the finding.
fileprivate static func removeUnexpectedUnicode(_ unicode: UnicodeException) -> Finding.Message {
  let message: Finding.Message = "remove unexpected unicode character \\\(unicode)"
  return message
}
516584
}

‎Tests/SwiftFormatTests/PrettyPrint/WhitespaceLintTests.swift

+25
Original file line numberDiff line numberDiff line change
@@ -255,4 +255,29 @@ final class WhitespaceLintTests: WhitespaceTestCase {
255255
]
256256
)
257257
}
258+
259+
/// Checks that U+2028 and U+2029 characters in the linted input are each reported with a
/// "remove unexpected unicode character" finding naming the offending character.
///
/// The input holds six such characters spread over four comment lines, and six findings are
/// expected, one per character.
/// NOTE(review): the numbered emoji presumably mark the positions at which the findings are
/// expected; confirm against `assertWhitespaceLint` and `FindingSpec`.
func testUnexpectedUnicodeCharacters() {
260+
assertWhitespaceLint(
261+
input: """
262+
// Hello World1️⃣\u{2028}
263+
// Hello2️⃣\u{2028}World
264+
// Hello World3️⃣\u{2028}4️⃣\u{2029}5️⃣\u{2029}
265+
// Hello World 6️⃣\u{2028}
266+
""",
267+
expected: """
268+
// Hello World
269+
// Hello World
270+
// Hello World
271+
// Hello World
272+
""",
273+
findings: [
274+
FindingSpec("1️⃣", message: "remove unexpected unicode character \\u2028"),
275+
FindingSpec("2️⃣", message: "remove unexpected unicode character \\u2028"),
276+
FindingSpec("3️⃣", message: "remove unexpected unicode character \\u2028"),
277+
FindingSpec("4️⃣", message: "remove unexpected unicode character \\u2029"),
278+
FindingSpec("5️⃣", message: "remove unexpected unicode character \\u2029"),
279+
FindingSpec("6️⃣", message: "remove unexpected unicode character \\u2028"),
280+
]
281+
)
282+
}
258283
}

0 commit comments

Comments
 (0)
Please sign in to comment.