Handle unprocessable whitespace-related unicode characters

TTOzzi · TTOzzi · commit e27cf406984d · 2025-03-15T00:20:31.000+09:00
diff --git a/Sources/SwiftFormat/PrettyPrint/WhitespaceFindingCategory.swift b/Sources/SwiftFormat/PrettyPrint/WhitespaceFindingCategory.swift
@@ -33,6 +33,9 @@ enum WhitespaceFindingCategory: FindingCategorizing {
   /// Findings related to the length of a line.
   case lineLength
 
+  /// Findings related to the presence of disallowed or unexpected Unicode whitespace characters.
+  case unexpectedUnicode
+
   var description: String {
     switch self {
     case .trailingWhitespace: return "TrailingWhitespace"
@@ -42,6 +45,7 @@ enum WhitespaceFindingCategory: FindingCategorizing {
     case .removeLine: return "RemoveLine"
     case .addLines: return "AddLines"
     case .lineLength: return "LineLength"
+    case .unexpectedUnicode: return "UnexpectedUnicode"
     }
   }
 }
diff --git a/Sources/SwiftFormat/PrettyPrint/WhitespaceLinter.swift b/Sources/SwiftFormat/PrettyPrint/WhitespaceLinter.swift
@@ -113,6 +113,15 @@ public class WhitespaceLinter {
       let userRun = userRunsIterator.next()!
       let formattedRun = formattedRunsIterator.next()!
 
+      // Diagnose unexpected Unicode characters first; when any are found, skip the remaining checks.
+      let unicodeExceptionErrors = checkForUnicodeExceptionErrors(userIndex: userIndex, userRun: userRun)
+      guard unicodeExceptionErrors.isEmpty else {
+        unicodeExceptionErrors.forEach { exception, offset in
+          diagnose(.removeUnexpectedUnicode(exception), category: .unexpectedUnicode, utf8Offset: offset)
+        }
+        return
+      }
+
       // If there was only a single whitespace run in each input, then that means there weren't any
       // newlines. Therefore, we're looking at inter-token spacing, unless the whitespace runs
       // preceded the first token in the file (i.e., offset == 0), in which case we ignore it here
@@ -126,7 +135,13 @@ public class WhitespaceLinter {
 
       while let userRun = userRunsIterator.next() {
         let possibleFormattedRun = formattedRunsIterator.next()
-
+        let unicodeExceptionErrors = checkForUnicodeExceptionErrors(userIndex: userIndex, userRun: userRun)
+        guard unicodeExceptionErrors.isEmpty else {
+          unicodeExceptionErrors.forEach { exception, offset in
+            diagnose(.removeUnexpectedUnicode(exception), category: .unexpectedUnicode, utf8Offset: offset)
+          }
+          continue
+        }
         if runIndex < excessUserLines {
           // If there were excess newlines in the user input, tell the user to remove them. This
           // short-circuits the trailing whitespace check below; we don't bother telling the user
@@ -323,6 +338,33 @@ public class WhitespaceLinter {
     }
   }
 
+  /// Scans `userRun` for unexpected Unicode sequences and returns every occurrence with its UTF-8 offset.
+  ///
+  /// - Parameters:
+  ///   - userIndex: The UTF-8 offset within the user text at which `userRun` is taken to start.
+  ///   - userRun: A run of whitespace from the user text.
+  /// - Returns: An array pairing each matched `UnicodeException` with `userIndex` plus its offset in `userRun`.
+  private func checkForUnicodeExceptionErrors(
+    userIndex: Int,
+    userRun: ArraySlice<UTF8.CodeUnit>
+  ) -> [(exception: UnicodeException, offset: Int)] {
+    var matches: [(UnicodeException, Int)] = []
+    var offset = 0  // Position within `userRun`, counted in UTF-8 code units.
+    while offset < userRun.count {
+      if let exception = UnicodeException.allCases.first(where: { exception in
+        let bytes = exception.utf8Bytes
+        return offset + bytes.count <= userRun.count  // Enough code units left for a full match.
+          && userRun.dropFirst(offset).prefix(bytes.count).elementsEqual(bytes)
+      }) {
+        matches.append((exception, userIndex + offset))
+        offset += exception.utf8Bytes.count  // Skip the whole matched sequence.
+      } else {
+        offset += 1
+      }
+    }
+    return matches
+  }
+
   /// Find the next non-whitespace character in a given string, and any leading whitespace before
   /// the character.
   ///
@@ -339,20 +381,26 @@ public class WhitespaceLinter {
     startingAt offset: Int,
     in data: [UTF8.CodeUnit]
   ) -> ArraySlice<UTF8.CodeUnit> {
-    func isWhitespace(_ char: UTF8.CodeUnit) -> Bool {
-      switch char {
-      case UInt8(ascii: " "), UInt8(ascii: "\n"), UInt8(ascii: "\t"), UInt8(ascii: "\r"), /*VT*/ 0x0B, /*FF*/ 0x0C:
-        return true
+    var currentIndex = offset
+    while currentIndex < data.count {
+      if let unicodeException = UnicodeException.allCases.first(where: { exception in
+        let bytes = exception.utf8Bytes
+        return currentIndex + bytes.count <= data.count
+          && data[currentIndex..<currentIndex + bytes.count].elementsEqual(bytes)
+      }) {
+        currentIndex += unicodeException.utf8Bytes.count
+        continue
+      }
+
+      switch data[currentIndex] {
+      case UInt8(ascii: " "), UInt8(ascii: "\n"), UInt8(ascii: "\t"), UInt8(ascii: "\r"),
+        /*VT*/ 0x0B, /*FF*/ 0x0C:
+        currentIndex += 1
       default:
-        return false
+        return data[offset..<currentIndex]
       }
     }
-    guard
-      let whitespaceEnd = data[offset...].firstIndex(where: { !isWhitespace($0) })
-    else {
-      return data[offset..<data.endIndex]
-    }
-    return data[offset..<whitespaceEnd]
+    return data[offset..<currentIndex]
   }
 
   /// Returns the code unit at the given index, or nil if the index is the end of the data.
@@ -412,6 +460,22 @@ public class WhitespaceLinter {
   }
 }
 
+/// Unicode characters diagnosed as unexpected; each case's name is interpolated into the finding message.
+private enum UnicodeException: CaseIterable {
+  case u2028  // U+2028 LINE SEPARATOR
+  case u2029  // U+2029 PARAGRAPH SEPARATOR
+
+  /// The UTF-8 encoding of the character, used to match it code unit by code unit in the source text.
+  var utf8Bytes: [UTF8.CodeUnit] {
+    switch self {
+    case .u2028:
+      return [0xE2, 0x80, 0xA8]  // UTF-8 encoding of U+2028.
+    case .u2029:
+      return [0xE2, 0x80, 0xA9]  // UTF-8 encoding of U+2029.
+    }
+  }
+}
+
 /// Describes the composition of the whitespace that creates an indentation for a line of code.
 public enum WhitespaceIndentation: Equatable {
   /// The line has no preceding whitespace, meaning there's no indentation.
@@ -513,4 +577,8 @@ extension Finding.Message {
   }
 
   fileprivate static let lineLengthError: Finding.Message = "line is too long"
+
+  fileprivate static func removeUnexpectedUnicode(_ unicode: UnicodeException) -> Finding.Message {
+    return "remove unexpected unicode character \\\(unicode)"  // Backslash + case name, e.g. \u2028.
+  }
 }
diff --git a/Tests/SwiftFormatTests/PrettyPrint/WhitespaceLintTests.swift b/Tests/SwiftFormatTests/PrettyPrint/WhitespaceLintTests.swift
@@ -255,4 +255,29 @@ final class WhitespaceLintTests: WhitespaceTestCase {
       ]
     )
   }
+
+  func testUnexpectedUnicodeCharacters() {
+    assertWhitespaceLint(
+      input: """
+        // Hello World1️⃣\u{2028}
+        // Hello2️⃣\u{2028}World
+        // Hello World3️⃣\u{2028}4️⃣\u{2029}5️⃣\u{2029}
+        // Hello World    6️⃣\u{2028}
+        """,
+      expected: """
+        // Hello World
+        // Hello World
+        // Hello World
+        // Hello World
+        """,
+      findings: [
+        FindingSpec("1️⃣", message: "remove unexpected unicode character \\u2028"),  // At the end of a line.
+        FindingSpec("2️⃣", message: "remove unexpected unicode character \\u2028"),  // Between two words.
+        FindingSpec("3️⃣", message: "remove unexpected unicode character \\u2028"),  // First of three in a row.
+        FindingSpec("4️⃣", message: "remove unexpected unicode character \\u2029"),  // Second of three in a row.
+        FindingSpec("5️⃣", message: "remove unexpected unicode character \\u2029"),  // Third of three in a row.
+        FindingSpec("6️⃣", message: "remove unexpected unicode character \\u2028"),  // After trailing spaces.
+      ]
+    )
+  }
 }

Original file line number	Diff line number	Diff line change
`@@ -33,6 +33,9 @@ enum WhitespaceFindingCategory: FindingCategorizing {`
`33`	`33`	`/// Findings related to the length of a line.`
`34`	`34`	`case lineLength`
`35`	`35`
	`36`	`+ /// Findings related to the presence of disallowed or unexpected Unicode whitespace characters.`
	`37`	`+ case unexpectedUnicode`
	`38`	`+`
`36`	`39`	`var description: String {`
`37`	`40`	`switch self {`
`38`	`41`	`case .trailingWhitespace: return "TrailingWhitespace"`
`@@ -42,6 +45,7 @@ enum WhitespaceFindingCategory: FindingCategorizing {`
`42`	`45`	`case .removeLine: return "RemoveLine"`
`43`	`46`	`case .addLines: return "AddLines"`
`44`	`47`	`case .lineLength: return "LineLength"`
	`48`	`+ case .unexpectedUnicode: return "UnexpectedUnicode"`
`45`	`49`	`}`
`46`	`50`	`}`
`47`	`51`	`}`