Add better support for dealing with supplemental-plane code units (da…

…rt-lang#46)
nex3 · Jun 2, 2022 · c637deb · c637deb
1 parent 1259807
commit c637deb
Show file tree

Hide file tree

Showing 9 changed files with 436 additions and 16 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,17 @@
+## 1.2.0
+
+* Add better support for reading code points in the Unicode supplementary plane:
+
+  * Added `StringScanner.readCodePoint()`, which consumes an entire Unicode code
+    point even if it's represented by two UTF-16 code units.
+
+  * Added `StringScanner.peekCodePoint()`, which returns an entire Unicode code
+    point even if it's represented by two UTF-16 code units.
+
+  * `StringScanner.scanChar()` and `StringScanner.expectChar()` will now
+    properly consume two UTF-16 code units if they're passed Unicode code points
+    in the supplementary plane.
+
 ## 1.1.1
 
 * Populate the pubspec `repository` field.

diff --git a/lib/src/eager_span_scanner.dart b/lib/src/eager_span_scanner.dart
@@ -5,6 +5,7 @@
 import 'charcode.dart';
 import 'line_scanner.dart';
 import 'span_scanner.dart';
+import 'utils.dart';
 
 // TODO(nweiz): Currently this duplicates code in line_scanner.dart. Once
 // sdk#23770 is fully complete, we should move the shared code into a mixin.
@@ -90,7 +91,7 @@ class EagerSpanScanner extends SpanScanner {
       _line += 1;
       _column = 0;
     } else {
-      _column += 1;
+      _column += inSupplementaryPlane(character) ? 2 : 1;
     }
   }
 

diff --git a/lib/src/line_scanner.dart b/lib/src/line_scanner.dart
@@ -4,6 +4,7 @@
 
 import 'charcode.dart';
 import 'string_scanner.dart';
+import 'utils.dart';
 
 // Note that much of this code is duplicated in eager_span_scanner.dart.
 
@@ -95,7 +96,7 @@ class LineScanner extends StringScanner {
       _line += 1;
       _column = 0;
     } else {
-      _column += 1;
+      _column += inSupplementaryPlane(character) ? 2 : 1;
     }
   }
 

diff --git a/lib/src/string_scanner.dart b/lib/src/string_scanner.dart
@@ -90,16 +90,35 @@ class StringScanner {
 
   /// If the next character in the string is [character], consumes it.
   ///
+  /// If [character] is a Unicode code point in a supplementary plane, this will
+  /// consume two code units. Dart's string representation is UTF-16, which
+  /// represents supplementary-plane code points as two code units.
+  ///
   /// Returns whether or not [character] was consumed.
   bool scanChar(int character) {
-    if (isDone) return false;
-    if (string.codeUnitAt(_position) != character) return false;
-    _position++;
-    return true;
+    if (inSupplementaryPlane(character)) {
+      if (_position + 1 >= string.length ||
+          string.codeUnitAt(_position) != highSurrogate(character) ||
+          string.codeUnitAt(_position + 1) != lowSurrogate(character)) {
+        return false;
+      } else {
+        _position += 2;
+        return true;
+      }
+    } else {
+      if (isDone) return false;
+      if (string.codeUnitAt(_position) != character) return false;
+      _position++;
+      return true;
+    }
   }
 
   /// If the next character in the string is [character], consumes it.
   ///
+  /// If [character] is a Unicode code point in a supplementary plane, this will
+  /// consume two code units. Dart's string representation is UTF-16, which
+  /// represents supplementary-plane code points as two code units.
+  ///
   /// If [character] could not be consumed, throws a [FormatException]
   /// describing the position of the failure. [name] is used in this error as
   /// the expected name of the character being matched; if it's `null`, the
@@ -120,6 +139,43 @@ class StringScanner {
     _fail(name);
   }
 
+  /// Consumes a single Unicode code point and returns it.
+  ///
+  /// This works like [readChar], except that it automatically handles UTF-16
+  /// surrogate pairs. Specifically, if the next two code units form a surrogate
+  /// pair, consumes them both and returns the corresponding Unicode code point.
+  ///
+  /// If the next two code units are not a surrogate pair, the next code unit
+  /// is returned as-is, even if it's an unpaired surrogate.
+  int readCodePoint() {
+    final first = readChar();
+    if (!isHighSurrogate(first)) return first;
+
+    final next = peekChar();
+    if (next == null || !isLowSurrogate(next)) return first;
+
+    readChar();
+    return decodeSurrogatePair(first, next);
+  }
+
+  /// Returns the Unicode code point immediately after [position].
+  ///
+  /// This works like [peekChar], except that it automatically handles UTF-16
+  /// surrogate pairs. Specifically, if the next two code units form a surrogate
+  /// pair, returns the corresponding Unicode code point.
+  ///
+  /// If the next two code units are not a surrogate pair, the next code unit
+  /// is returned as-is, even if it's an unpaired surrogate.
+  int? peekCodePoint() {
+    final first = peekChar();
+    if (first == null || !isHighSurrogate(first)) return first;
+
+    final next = peekChar(1);
+    if (next == null || !isLowSurrogate(next)) return first;
+
+    return decodeSurrogatePair(first, next);
+  }
+
   /// If [pattern] matches at the current position of the string, scans forward
   /// until the end of the match.
   ///

diff --git a/lib/src/utils.dart b/lib/src/utils.dart
@@ -29,3 +29,67 @@ void validateErrorArgs(
         'the string.');
   }
 }
+
+// See https://en.wikipedia.org/wiki/UTF-16#Code_points_from_U+010000_to_U+10FFFF
+// for documentation on how UTF-16 encoding works and definitions of various
+// related terms.
+
+/// The inclusive lower bound of Unicode's supplementary plane.
+const _supplementaryPlaneLowerBound = 0x10000;
+
+/// The inclusive upper bound of Unicode's supplementary plane.
+const _supplementaryPlaneUpperBound = 0x10FFFF;
+
+/// The inclusive lower bound of the UTF-16 high surrogate block.
+const _highSurrogateLowerBound = 0xD800;
+
+/// The inclusive lower bound of the UTF-16 low surrogate block.
+const _lowSurrogateLowerBound = 0xDC00;
+
+/// The number of low bits in each code unit of a surrogate pair that goes into
+/// determining which code point it encodes.
+const _surrogateBits = 10;
+
+/// A bit mask that covers the lower [_surrogateBits] bits of an integer, which
+/// can be used to extract the value of a surrogate or the low surrogate value
+/// of a code point.
+const _surrogateValueMask = (1 << _surrogateBits) - 1;
+
+/// Returns whether [codePoint] is in the Unicode supplementary plane, and thus
+/// must be represented as a surrogate pair in UTF-16.
+bool inSupplementaryPlane(int codePoint) =>
+    codePoint >= _supplementaryPlaneLowerBound &&
+    codePoint <= _supplementaryPlaneUpperBound;
+
+/// Returns whether [codeUnit] is a UTF-16 high surrogate.
+bool isHighSurrogate(int codeUnit) =>
+    (codeUnit & ~_surrogateValueMask) == _highSurrogateLowerBound;
+
+/// Returns whether [codeUnit] is a UTF-16 low surrogate.
+bool isLowSurrogate(int codeUnit) =>
+    (codeUnit >> _surrogateBits) == (_lowSurrogateLowerBound >> _surrogateBits);
+
+/// Returns the high surrogate needed to encode the supplementary-plane
+/// [codePoint].
+int highSurrogate(int codePoint) {
+  assert(inSupplementaryPlane(codePoint));
+  return ((codePoint - _supplementaryPlaneLowerBound) >> _surrogateBits) +
+      _highSurrogateLowerBound;
+}
+
+/// Returns the low surrogate needed to encode the supplementary-plane
+/// [codePoint].
+int lowSurrogate(int codePoint) {
+  assert(inSupplementaryPlane(codePoint));
+  return ((codePoint - _supplementaryPlaneLowerBound) & _surrogateValueMask) +
+      _lowSurrogateLowerBound;
+}
+
+/// Converts a UTF-16 surrogate pair into the Unicode code point it represents.
+int decodeSurrogatePair(int highSurrogate, int lowSurrogate) {
+  assert(isHighSurrogate(highSurrogate));
+  assert(isLowSurrogate(lowSurrogate));
+  return _supplementaryPlaneLowerBound +
+      (((highSurrogate & _surrogateValueMask) << _surrogateBits) |
+          (lowSurrogate & _surrogateValueMask));
+}
diff --git a/pubspec.yaml b/pubspec.yaml
@@ -1,5 +1,5 @@
 name: string_scanner
-version: 1.1.1
+version: 1.2.0
 description: A class for parsing strings using a sequence of patterns.
 repository: https://github.com/dart-lang/string_scanner
 

diff --git a/test/line_scanner_test.dart b/test/line_scanner_test.dart
@@ -81,6 +81,39 @@ void main() {
     });
   });
 
+  group('readCodePoint()', () {
+    test('on a non-newline character increases the column but not the line',
+        () {
+      scanner.readCodePoint();
+      expect(scanner.line, equals(0));
+      expect(scanner.column, equals(1));
+    });
+
+    test('consuming a newline resets the column and increases the line', () {
+      scanner.expect('foo');
+      expect(scanner.line, equals(0));
+      expect(scanner.column, equals(3));
+
+      scanner.readCodePoint();
+      expect(scanner.line, equals(1));
+      expect(scanner.column, equals(0));
+    });
+
+    test("consuming halfway through a CR LF doesn't count as a line", () {
+      scanner.expect('foo\nbar');
+      expect(scanner.line, equals(1));
+      expect(scanner.column, equals(3));
+
+      scanner.readCodePoint();
+      expect(scanner.line, equals(1));
+      expect(scanner.column, equals(4));
+
+      scanner.readCodePoint();
+      expect(scanner.line, equals(2));
+      expect(scanner.column, equals(0));
+    });
+  });
+
   group('scanChar()', () {
     test('on a non-newline character increases the column but not the line',
         () {
@@ -114,6 +147,59 @@ void main() {
     });
   });
 
+  group('before a surrogate pair', () {
+    final codePoint = '\uD83D\uDC6D'.runes.first;
+    const highSurrogate = 0xD83D;
+
+    late LineScanner scanner;
+    setUp(() {
+      scanner = LineScanner('foo: \uD83D\uDC6D');
+      expect(scanner.scan('foo: '), isTrue);
+    });
+
+    test('readChar returns the high surrogate and moves into the pair', () {
+      expect(scanner.readChar(), equals(highSurrogate));
+      expect(scanner.line, equals(0));
+      expect(scanner.column, equals(6));
+      expect(scanner.position, equals(6));
+    });
+
+    test('readCodePoint returns the code unit and moves past the pair', () {
+      expect(scanner.readCodePoint(), equals(codePoint));
+      expect(scanner.line, equals(0));
+      expect(scanner.column, equals(7));
+      expect(scanner.position, equals(7));
+    });
+
+    test('scanChar with the high surrogate moves into the pair', () {
+      expect(scanner.scanChar(highSurrogate), isTrue);
+      expect(scanner.line, equals(0));
+      expect(scanner.column, equals(6));
+      expect(scanner.position, equals(6));
+    });
+
+    test('scanChar with the code point moves past the pair', () {
+      expect(scanner.scanChar(codePoint), isTrue);
+      expect(scanner.line, equals(0));
+      expect(scanner.column, equals(7));
+      expect(scanner.position, equals(7));
+    });
+
+    test('expectChar with the high surrogate moves into the pair', () {
+      scanner.expectChar(highSurrogate);
+      expect(scanner.line, equals(0));
+      expect(scanner.column, equals(6));
+      expect(scanner.position, equals(6));
+    });
+
+    test('expectChar with the code point moves past the pair', () {
+      scanner.expectChar(codePoint);
+      expect(scanner.line, equals(0));
+      expect(scanner.column, equals(7));
+      expect(scanner.position, equals(7));
+    });
+  });
+
   group('position=', () {
     test('forward through newlines sets the line and column', () {
       scanner.position = 10; // "foo\nbar\r\nb"

diff --git a/test/span_scanner_test.dart b/test/span_scanner_test.dart
@@ -139,15 +139,6 @@ void testForImplementation(
       expect(span.text, equals('o\nbar\nba'));
     });
 
-    test('.spanFrom() handles surrogate pairs correctly', () {
-      scanner = create('fo\u{12345}o');
-      scanner.scan('fo');
-      final state = scanner.state;
-      scanner.scan('\u{12345}o');
-      final span = scanner.spanFrom(state);
-      expect(span.text, equals('\u{12345}o'));
-    });
-
     test('.emptySpan returns an empty span at the current location', () {
       scanner.scan('foo\nba');
 
@@ -164,5 +155,64 @@ void testForImplementation(
 
       expect(span.text, equals(''));
     });
+
+    group('before a surrogate pair', () {
+      final codePoint = '\uD83D\uDC6D'.runes.first;
+      const highSurrogate = 0xD83D;
+
+      late SpanScanner scanner;
+      setUp(() {
+        scanner = create('foo: \uD83D\uDC6D bar');
+        expect(scanner.scan('foo: '), isTrue);
+      });
+
+      test('readChar returns the high surrogate and moves into the pair', () {
+        expect(scanner.readChar(), equals(highSurrogate));
+        expect(scanner.line, equals(0));
+        expect(scanner.column, equals(6));
+        expect(scanner.position, equals(6));
+      });
+
+      test('readCodePoint returns the code unit and moves past the pair', () {
+        expect(scanner.readCodePoint(), equals(codePoint));
+        expect(scanner.line, equals(0));
+        expect(scanner.column, equals(7));
+        expect(scanner.position, equals(7));
+      });
+
+      test('scanChar with the high surrogate moves into the pair', () {
+        expect(scanner.scanChar(highSurrogate), isTrue);
+        expect(scanner.line, equals(0));
+        expect(scanner.column, equals(6));
+        expect(scanner.position, equals(6));
+      });
+
+      test('scanChar with the code point moves past the pair', () {
+        expect(scanner.scanChar(codePoint), isTrue);
+        expect(scanner.line, equals(0));
+        expect(scanner.column, equals(7));
+        expect(scanner.position, equals(7));
+      });
+
+      test('expectChar with the high surrogate moves into the pair', () {
+        scanner.expectChar(highSurrogate);
+        expect(scanner.line, equals(0));
+        expect(scanner.column, equals(6));
+        expect(scanner.position, equals(6));
+      });
+
+      test('expectChar with the code point moves past the pair', () {
+        scanner.expectChar(codePoint);
+        expect(scanner.line, equals(0));
+        expect(scanner.column, equals(7));
+        expect(scanner.position, equals(7));
+      });
+
+      test('spanFrom covers the surrogate pair', () {
+        final state = scanner.state;
+        scanner.scan('\uD83D\uDC6D b');
+        expect(scanner.spanFrom(state).text, equals('\uD83D\uDC6D b'));
+      });
+    });
   });
 }