@@ -113,6 +113,15 @@ public class WhitespaceLinter {
113
113
let userRun = userRunsIterator. next ( ) !
114
114
let formattedRun = formattedRunsIterator. next ( ) !
115
115
116
+ // Print a diagnostic for unexpected Unicode characters with the highest priority.
117
+ let unicodeExceptionErrors = checkForUnicodeExceptionErrors ( userIndex: userIndex, userRun: userRun)
118
+ guard unicodeExceptionErrors. isEmpty else {
119
+ unicodeExceptionErrors. forEach { exception, offset in
120
+ diagnose ( . removeUnexpectedUnicode( exception) , category: . unexpectedUnicode, utf8Offset: offset)
121
+ }
122
+ return
123
+ }
124
+
116
125
// If there was only a single whitespace run in each input, then that means there weren't any
117
126
// newlines. Therefore, we're looking at inter-token spacing, unless the whitespace runs
118
127
// preceded the first token in the file (i.e., offset == 0), in which case we ignore it here
@@ -126,7 +135,13 @@ public class WhitespaceLinter {
126
135
127
136
while let userRun = userRunsIterator. next ( ) {
128
137
let possibleFormattedRun = formattedRunsIterator. next ( )
129
-
138
+ let unicodeExceptionErrors = checkForUnicodeExceptionErrors ( userIndex: userIndex, userRun: userRun)
139
+ guard unicodeExceptionErrors. isEmpty else {
140
+ unicodeExceptionErrors. forEach { exception, offset in
141
+ diagnose ( . removeUnexpectedUnicode( exception) , category: . unexpectedUnicode, utf8Offset: offset)
142
+ }
143
+ continue
144
+ }
130
145
if runIndex < excessUserLines {
131
146
// If there were excess newlines in the user input, tell the user to remove them. This
132
147
// short-circuits the trailing whitespace check below; we don't bother telling the user
@@ -323,6 +338,33 @@ public class WhitespaceLinter {
323
338
}
324
339
}
325
340
341
/// Scans a whitespace run for the byte sequences of every `UnicodeException` and reports each hit.
///
/// The run is walked front to back. When the remaining code units begin with the UTF-8 bytes of
/// one of the exceptions, the match is recorded and the scan skips past those bytes; otherwise
/// the scan advances by a single code unit.
///
/// - Parameters:
///   - userIndex: The offset of the first code unit of `userRun` within the user text. Each
///     reported position is this value plus the match's code-unit offset inside the run.
///   - userRun: A run of whitespace code units taken from the user text.
/// - Returns: Every matched `UnicodeException` paired with its position, in order of occurrence.
///   The array is empty when the run contains none of them.
private func checkForUnicodeExceptionErrors(
  userIndex: Int,
  userRun: ArraySlice<UTF8.CodeUnit>
) -> [(exception: UnicodeException, offset: Int)] {
  var found: [(exception: UnicodeException, offset: Int)] = []
  // The not-yet-inspected tail of the run; it shrinks from the front as the scan progresses.
  var rest = userRun
  while !rest.isEmpty {
    // Slices share indices with their base, so the index distance is the offset inside the run.
    let position = userIndex + (rest.startIndex - userRun.startIndex)
    if let hit = UnicodeException.allCases.first(where: { rest.starts(with: $0.utf8Bytes) }) {
      found.append((exception: hit, offset: position))
      rest = rest.dropFirst(hit.utf8Bytes.count)
    } else {
      rest = rest.dropFirst()
    }
  }
  return found
}
367
+
326
368
/// Find the next non-whitespace character in a given string, and any leading whitespace before
327
369
/// the character.
328
370
///
@@ -339,20 +381,26 @@ public class WhitespaceLinter {
339
381
startingAt offset: Int ,
340
382
in data: [ UTF8 . CodeUnit ]
341
383
) -> ArraySlice < UTF8 . CodeUnit > {
342
- func isWhitespace( _ char: UTF8 . CodeUnit ) -> Bool {
343
- switch char {
344
- case UInt8 ( ascii: " " ) , UInt8 ( ascii: " \n " ) , UInt8 ( ascii: " \t " ) , UInt8 ( ascii: " \r " ) , /*VT*/ 0x0B , /*FF*/ 0x0C :
345
- return true
384
+ var currentIndex = offset
385
+ while currentIndex < data. count {
386
+ if let unicodeException = UnicodeException . allCases. first ( where: { exception in
387
+ let bytes = exception. utf8Bytes
388
+ return currentIndex + bytes. count <= data. count
389
+ && data [ currentIndex..< currentIndex + bytes. count] . elementsEqual ( bytes)
390
+ } ) {
391
+ currentIndex += unicodeException. utf8Bytes. count
392
+ continue
393
+ }
394
+
395
+ switch data [ currentIndex] {
396
+ case UInt8 ( ascii: " " ) , UInt8 ( ascii: " \n " ) , UInt8 ( ascii: " \t " ) , UInt8 ( ascii: " \r " ) ,
397
+ /*VT*/ 0x0B , /*FF*/ 0x0C :
398
+ currentIndex += 1
346
399
default :
347
- return false
400
+ return data [ offset ..< currentIndex ]
348
401
}
349
402
}
350
- guard
351
- let whitespaceEnd = data [ offset... ] . firstIndex ( where: { !isWhitespace( $0) } )
352
- else {
353
- return data [ offset..< data. endIndex]
354
- }
355
- return data [ offset..< whitespaceEnd]
403
+ return data [ offset..< currentIndex]
356
404
}
357
405
358
406
/// Returns the code unit at the given index, or nil if the index is the end of the data.
@@ -412,6 +460,22 @@ public class WhitespaceLinter {
412
460
}
413
461
}
414
462
463
/// The unexpected Unicode characters that cannot be processed normally and are instead matched
/// by their UTF-8 byte sequences.
private enum UnicodeException: CaseIterable {
  /// U+2028 LINE SEPARATOR
  case u2028
  /// U+2029 PARAGRAPH SEPARATOR
  case u2029

  /// The UTF-8 encoding of the character this case stands for.
  var utf8Bytes: [UTF8.CodeUnit] {
    // Both characters encode to three bytes that differ only in the final one.
    let finalByte: UTF8.CodeUnit
    switch self {
    case .u2028:
      finalByte = 0xA8
    case .u2029:
      finalByte = 0xA9
    }
    return [0xE2, 0x80, finalByte]
  }
}
478
+
415
479
/// Describes the composition of the whitespace that creates an indentation for a line of code.
416
480
public enum WhitespaceIndentation : Equatable {
417
481
/// The line has no preceding whitespace, meaning there's no indentation.
@@ -513,4 +577,8 @@ extension Finding.Message {
513
577
}
514
578
515
579
fileprivate static let lineLengthError : Finding . Message = " line is too long "
580
+
581
/// Builds the finding message that asks the user to remove an unexpected Unicode character.
///
/// - Parameter unicode: The exception that was found in the user's whitespace.
/// - Returns: The message text, ending in a backslash followed by the interpolated `unicode`.
fileprivate static func removeUnexpectedUnicode(_ unicode: UnicodeException) -> Finding.Message {
  // `\\` is a literal backslash; with the interpolated value after it, the text presumably reads
  // like an escape sequence (e.g. `\u2028`) -- confirm against how `Finding.Message` interpolates.
  let message: Finding.Message = "remove unexpected unicode character \\\(unicode)"
  return message
}
516
584
}
0 commit comments