@@ -113,6 +113,15 @@ public class WhitespaceLinter {
113113 let userRun = userRunsIterator. next ( ) !
114114 let formattedRun = formattedRunsIterator. next ( ) !
115115
116+ // Print a diagnostic for unexpected Unicode characters with the highest priority.
117+ let unicodeExceptionErrors = checkForUnicodeExceptionErrors ( userIndex: userIndex, userRun: userRun)
118+ guard unicodeExceptionErrors. isEmpty else {
119+ unicodeExceptionErrors. forEach { exception, offset in
120+ diagnose ( . removeUnexpectedUnicode( exception) , category: . unexpectedUnicode, utf8Offset: offset)
121+ }
122+ return
123+ }
124+
116125 // If there was only a single whitespace run in each input, then that means there weren't any
117126 // newlines. Therefore, we're looking at inter-token spacing, unless the whitespace runs
118127 // preceded the first token in the file (i.e., offset == 0), in which case we ignore it here
@@ -126,7 +135,13 @@ public class WhitespaceLinter {
126135
127136 while let userRun = userRunsIterator. next ( ) {
128137 let possibleFormattedRun = formattedRunsIterator. next ( )
129-
138+ let unicodeExceptionErrors = checkForUnicodeExceptionErrors ( userIndex: userIndex, userRun: userRun)
139+ guard unicodeExceptionErrors. isEmpty else {
140+ unicodeExceptionErrors. forEach { exception, offset in
141+ diagnose ( . removeUnexpectedUnicode( exception) , category: . unexpectedUnicode, utf8Offset: offset)
142+ }
143+ continue
144+ }
130145 if runIndex < excessUserLines {
131146 // If there were excess newlines in the user input, tell the user to remove them. This
132147 // short-circuits the trailing whitespace check below; we don't bother telling the user
@@ -323,6 +338,34 @@ public class WhitespaceLinter {
323338 }
324339 }
325340
/// Scans a whitespace run for the UTF-8 encodings of the `UnicodeException` cases.
///
/// The run is walked from its first code unit. At each position the cases are tried in
/// `allCases` order; when one matches, it is recorded and the scan jumps past its bytes,
/// otherwise the scan advances by a single code unit.
///
/// - Parameters:
///   - userIndex: The current character offset within the user text. It is added to each
///     match's position inside `userRun` to form the reported offset.
///   - userRun: A run of whitespace from the user text.
/// - Returns: Every matched `UnicodeException` paired with its offset, in the order found.
private func checkForUnicodeExceptionErrors(
  userIndex: Int,
  userRun: ArraySlice<UTF8.CodeUnit>
) -> [(exception: UnicodeException, offset: Int)] {
  var found: [(UnicodeException, Int)] = []
  // Work with the slice's own indices; a slice does not necessarily start at zero.
  var position = userRun.startIndex
  while position < userRun.endIndex {
    var hit: UnicodeException? = nil
    for candidate in UnicodeException.allCases {
      let pattern = candidate.utf8Bytes
      let limit = position + pattern.count
      // Check the bound first so the range subscript never reaches past the run.
      if limit <= userRun.endIndex && userRun[position..<limit].elementsEqual(pattern) {
        hit = candidate
        break
      }
    }

    guard let matched = hit else {
      position += 1
      continue
    }
    found.append((matched, userIndex + (position - userRun.startIndex)))
    position += matched.utf8Bytes.count
  }
  return found
}
368+
326369 /// Find the next non-whitespace character in a given string, and any leading whitespace before
327370 /// the character.
328371 ///
@@ -339,20 +382,26 @@ public class WhitespaceLinter {
339382 startingAt offset: Int ,
340383 in data: [ UTF8 . CodeUnit ]
341384 ) -> ArraySlice < UTF8 . CodeUnit > {
342- func isWhitespace( _ char: UTF8 . CodeUnit ) -> Bool {
343- switch char {
344- case UInt8 ( ascii: " " ) , UInt8 ( ascii: " \n " ) , UInt8 ( ascii: " \t " ) , UInt8 ( ascii: " \r " ) , /*VT*/ 0x0B , /*FF*/ 0x0C :
345- return true
385+ var currentIndex = offset
386+ while currentIndex < data. count {
387+ if let unicodeException = UnicodeException . allCases. first ( where: { exception in
388+ let bytes = exception. utf8Bytes
389+ return currentIndex + bytes. count <= data. count
390+ && data [ currentIndex..< currentIndex + bytes. count] . elementsEqual ( bytes)
391+ } ) {
392+ currentIndex += unicodeException. utf8Bytes. count
393+ continue
394+ }
395+
396+ switch data [ currentIndex] {
397+ case UInt8 ( ascii: " " ) , UInt8 ( ascii: " \n " ) , UInt8 ( ascii: " \t " ) , UInt8 ( ascii: " \r " ) ,
398+ /*VT*/ 0x0B , /*FF*/ 0x0C :
399+ currentIndex += 1
346400 default :
347- return false
401+ return data [ offset ..< currentIndex ]
348402 }
349403 }
350- guard
351- let whitespaceEnd = data [ offset... ] . firstIndex ( where: { !isWhitespace( $0) } )
352- else {
353- return data [ offset..< data. endIndex]
354- }
355- return data [ offset..< whitespaceEnd]
404+ return data [ offset..< currentIndex]
356405 }
357406
358407 /// Returns the code unit at the given index, or nil if the index is the end of the data.
@@ -412,6 +461,22 @@ public class WhitespaceLinter {
412461 }
413462}
414463
/// Unexpected Unicode characters that cannot be processed normally, each identified by its
/// UTF-8 encoding.
private enum UnicodeException: CaseIterable {
  /// U+2028 LINE SEPARATOR
  case u2028
  /// U+2029 PARAGRAPH SEPARATOR
  case u2029

  /// The UTF-8 byte sequence that encodes this character.
  var utf8Bytes: [UTF8.CodeUnit] {
    // Both characters share the leading bytes 0xE2 0x80 and differ only in the final byte.
    let finalByte: UTF8.CodeUnit
    switch self {
    case .u2028:
      finalByte = 0xA8
    case .u2029:
      finalByte = 0xA9
    }
    return [0xE2, 0x80, finalByte]
  }
}
479+
415480/// Describes the composition of the whitespace that creates an indentation for a line of code.
416481public enum WhitespaceIndentation : Equatable {
417482 /// The line has no preceding whitespace, meaning there's no indentation.
@@ -513,4 +578,8 @@ extension Finding.Message {
513578 }
514579
515580 fileprivate static let lineLengthError : Finding . Message = " line is too long "
581+
/// Builds the finding message that asks for an unexpected Unicode character to be removed.
///
/// - Parameter unicode: The `UnicodeException` that was found; its description is interpolated
///   into the message after a literal backslash.
/// - Returns: The message to report for that character.
fileprivate static func removeUnexpectedUnicode(_ unicode: UnicodeException) -> Finding.Message {
  let message: Finding.Message = " remove unexpected unicode character \\ \( unicode) "
  return message
}
516585}
0 commit comments