Skip to content

Commit

Permalink
Add better support for dealing with supplemental-plane code units (da…
Browse files Browse the repository at this point in the history
  • Loading branch information
nex3 authored Jun 2, 2022
1 parent 1259807 commit c637deb
Show file tree
Hide file tree
Showing 9 changed files with 436 additions and 16 deletions.
14 changes: 14 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
## 1.2.0

* Add better support for reading code points in the Unicode supplementary planes:

* Added `StringScanner.readCodePoint()`, which consumes an entire Unicode code
point even if it's represented by two UTF-16 code units.

* Added `StringScanner.peekCodePoint()`, which returns an entire Unicode code
point even if it's represented by two UTF-16 code units.

* `StringScanner.scanChar()` and `StringScanner.expectChar()` will now
properly consume two UTF-16 code units if they're passed a Unicode code point
in a supplementary plane.

## 1.1.1

* Populate the pubspec `repository` field.
Expand Down
3 changes: 2 additions & 1 deletion lib/src/eager_span_scanner.dart
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import 'charcode.dart';
import 'line_scanner.dart';
import 'span_scanner.dart';
import 'utils.dart';

// TODO(nweiz): Currently this duplicates code in line_scanner.dart. Once
// sdk#23770 is fully complete, we should move the shared code into a mixin.
Expand Down Expand Up @@ -90,7 +91,7 @@ class EagerSpanScanner extends SpanScanner {
_line += 1;
_column = 0;
} else {
_column += 1;
_column += inSupplementaryPlane(character) ? 2 : 1;
}
}

Expand Down
3 changes: 2 additions & 1 deletion lib/src/line_scanner.dart
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import 'charcode.dart';
import 'string_scanner.dart';
import 'utils.dart';

// Note that much of this code is duplicated in eager_span_scanner.dart.

Expand Down Expand Up @@ -95,7 +96,7 @@ class LineScanner extends StringScanner {
_line += 1;
_column = 0;
} else {
_column += 1;
_column += inSupplementaryPlane(character) ? 2 : 1;
}
}

Expand Down
64 changes: 60 additions & 4 deletions lib/src/string_scanner.dart
Original file line number Diff line number Diff line change
Expand Up @@ -90,16 +90,35 @@ class StringScanner {

/// If the next character in the string is [character], consumes it.
///
/// If [character] is a Unicode code point in a supplementary plane, this will
/// consume two code units. Dart's string representation is UTF-16, which
/// represents supplementary-plane code points as two code units.
///
/// Returns whether or not [character] was consumed.
bool scanChar(int character) {
// NOTE(review): the next four statements always return, which makes the
// branches below them unreachable as written. They look like the pre-change
// body interleaved by the diff view -- confirm against the real file.
if (isDone) return false;
if (string.codeUnitAt(_position) != character) return false;
_position++;
return true;
if (inSupplementaryPlane(character)) {
// A supplementary-plane code point only matches if both code units of its
// surrogate pair are present, in order, at the current position.
if (_position + 1 >= string.length ||
string.codeUnitAt(_position) != highSurrogate(character) ||
string.codeUnitAt(_position + 1) != lowSurrogate(character)) {
return false;
} else {
_position += 2;
return true;
}
} else {
// Any other value is compared against a single code unit.
if (isDone) return false;
if (string.codeUnitAt(_position) != character) return false;
_position++;
return true;
}
}

/// If the next character in the string is [character], consumes it.
///
/// If [character] is a Unicode code point in a supplementary plane, this will
/// consume two code units. Dart's string representation is UTF-16, which
/// represents supplementary-plane code points as two code units.
///
/// If [character] could not be consumed, throws a [FormatException]
/// describing the position of the failure. [name] is used in this error as
/// the expected name of the character being matched; if it's `null`, the
Expand All @@ -120,6 +139,43 @@ class StringScanner {
_fail(name);
}

/// Consumes a single Unicode code point and returns it.
///
/// This works like [readChar], except that it automatically handles UTF-16
/// surrogate pairs. Specifically, if the next two code units form a surrogate
/// pair, this consumes them both and returns the code point they encode.
///
/// If the next two code units are not a surrogate pair, only the next code
/// unit is consumed and it's returned as-is, even if it's an unpaired
/// surrogate.
int readCodePoint() {
  final lead = readChar();
  if (isHighSurrogate(lead)) {
    // Only peek at the following unit, so that an unpaired high surrogate
    // doesn't swallow whatever comes after it.
    final trail = peekChar();
    if (trail != null && isLowSurrogate(trail)) {
      readChar();
      return decodeSurrogatePair(lead, trail);
    }
  }
  return lead;
}

/// Returns the Unicode code point immediately after [position].
///
/// This works like [peekChar], except that it automatically handles UTF-16
/// surrogate pairs. Specifically, if the next two code units form a surrogate
/// pair, this returns the code point they encode.
///
/// If the next two code units are not a surrogate pair, the next code unit is
/// returned as-is, even if it's an unpaired surrogate. Returns `null` whenever
/// [peekChar] does.
int? peekCodePoint() {
  final lead = peekChar();
  if (lead != null && isHighSurrogate(lead)) {
    final trail = peekChar(1);
    if (trail != null && isLowSurrogate(trail)) {
      return decodeSurrogatePair(lead, trail);
    }
  }
  return lead;
}

/// If [pattern] matches at the current position of the string, scans forward
/// until the end of the match.
///
Expand Down
64 changes: 64 additions & 0 deletions lib/src/utils.dart
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,67 @@ void validateErrorArgs(
'the string.');
}
}

// See https://en.wikipedia.org/wiki/UTF-16#Code_points_from_U+010000_to_U+10FFFF
// for documentation on how UTF-16 encoding works and definitions of various
// related terms.

/// The inclusive lower bound of Unicode's supplementary planes (U+10000).
const _supplementaryPlaneLowerBound = 0x10000;

/// The inclusive upper bound of Unicode's supplementary planes (U+10FFFF).
const _supplementaryPlaneUpperBound = 0x10FFFF;

/// The inclusive lower bound of the UTF-16 high surrogate block.
const _highSurrogateLowerBound = 0xD800;

/// The inclusive lower bound of the UTF-16 low surrogate block.
const _lowSurrogateLowerBound = 0xDC00;

/// The number of low bits in each code unit of a surrogate pair that go into
/// determining which code point the pair encodes.
const _surrogateBits = 10;

/// A bit mask that covers the low [_surrogateBits] bits of a value.
///
/// Applied to a surrogate, it extracts the bits that surrogate contributes to
/// the encoded code point. Applied to a code point's offset from
/// [_supplementaryPlaneLowerBound], it extracts the bits that are stored in
/// the low surrogate.
const _surrogateValueMask = (1 << _surrogateBits) - 1;

/// Returns whether [codePoint] lies in one of Unicode's supplementary planes.
///
/// Such a code point must be represented as a surrogate pair in UTF-16.
bool inSupplementaryPlane(int codePoint) {
  if (codePoint < _supplementaryPlaneLowerBound) return false;
  return codePoint <= _supplementaryPlaneUpperBound;
}

/// Returns whether [codeUnit] falls in the UTF-16 high surrogate block.
bool isHighSurrogate(int codeUnit) {
  // Clearing the value bits leaves only the bits that identify the block.
  final blockBits = codeUnit & ~_surrogateValueMask;
  return blockBits == _highSurrogateLowerBound;
}

/// Returns whether [codeUnit] falls in the UTF-16 low surrogate block.
bool isLowSurrogate(int codeUnit) {
  // Shifting out the value bits leaves only the bits that identify the block.
  const lowSurrogatePrefix = _lowSurrogateLowerBound >> _surrogateBits;
  return (codeUnit >> _surrogateBits) == lowSurrogatePrefix;
}

/// Returns the first (high) code unit of the UTF-16 surrogate pair that
/// encodes [codePoint].
///
/// [codePoint] must be in a supplementary plane; this is only checked by an
/// assertion.
int highSurrogate(int codePoint) {
  assert(inSupplementaryPlane(codePoint));
  final offset = codePoint - _supplementaryPlaneLowerBound;
  // The high surrogate carries the offset's bits above the low value bits.
  return (offset >> _surrogateBits) + _highSurrogateLowerBound;
}

/// Returns the second (low) code unit of the UTF-16 surrogate pair that
/// encodes [codePoint].
///
/// [codePoint] must be in a supplementary plane; this is only checked by an
/// assertion.
int lowSurrogate(int codePoint) {
  assert(inSupplementaryPlane(codePoint));
  final offset = codePoint - _supplementaryPlaneLowerBound;
  // The low surrogate carries the offset's lowest [_surrogateBits] bits.
  return (offset & _surrogateValueMask) + _lowSurrogateLowerBound;
}

/// Converts a UTF-16 surrogate pair into the Unicode code point it encodes.
///
/// [highSurrogate] and [lowSurrogate] must be a high and a low surrogate
/// respectively; this is only checked by assertions.
int decodeSurrogatePair(int highSurrogate, int lowSurrogate) {
  assert(isHighSurrogate(highSurrogate));
  assert(isLowSurrogate(lowSurrogate));
  final upperBits = (highSurrogate & _surrogateValueMask) << _surrogateBits;
  final lowerBits = lowSurrogate & _surrogateValueMask;
  return _supplementaryPlaneLowerBound + (upperBits | lowerBits);
}
2 changes: 1 addition & 1 deletion pubspec.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name: string_scanner
version: 1.1.1
version: 1.2.0
description: A class for parsing strings using a sequence of patterns.
repository: https://github.com/dart-lang/string_scanner

Expand Down
86 changes: 86 additions & 0 deletions test/line_scanner_test.dart
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,39 @@ void main() {
});
});

group('readCodePoint()', () {
  // Asserts the scanner's current line and column.
  void expectLineAndColumn(int line, int column) {
    expect(scanner.line, equals(line));
    expect(scanner.column, equals(column));
  }

  test('on a non-newline character increases the column but not the line',
      () {
    scanner.readCodePoint();
    expectLineAndColumn(0, 1);
  });

  test('consuming a newline resets the column and increases the line', () {
    scanner.expect('foo');
    expectLineAndColumn(0, 3);

    scanner.readCodePoint();
    expectLineAndColumn(1, 0);
  });

  test("consuming halfway through a CR LF doesn't count as a line", () {
    scanner.expect('foo\nbar');
    expectLineAndColumn(1, 3);

    // Reading only the first half of the line break must not start a new
    // line yet.
    scanner.readCodePoint();
    expectLineAndColumn(1, 4);

    scanner.readCodePoint();
    expectLineAndColumn(2, 0);
  });
});

group('scanChar()', () {
test('on a non-newline character increases the column but not the line',
() {
Expand Down Expand Up @@ -114,6 +147,59 @@ void main() {
});
});

group('before a surrogate pair', () {
  // U+1F46D, which UTF-16 encodes as the surrogate pair 0xD83D 0xDC6D.
  final codePoint = '\uD83D\uDC6D'.runes.first;
  const highSurrogate = 0xD83D;

  late LineScanner scanner;
  setUp(() {
    scanner = LineScanner('foo: \uD83D\uDC6D');
    expect(scanner.scan('foo: '), isTrue);
  });

  // Asserts that the scanner is still on the first line and that its column
  // and position both equal [offset], counted in UTF-16 code units.
  void expectOnFirstLineAt(int offset) {
    expect(scanner.line, equals(0));
    expect(scanner.column, equals(offset));
    expect(scanner.position, equals(offset));
  }

  test('readChar returns the high surrogate and moves into the pair', () {
    expect(scanner.readChar(), equals(highSurrogate));
    expectOnFirstLineAt(6);
  });

  test('readCodePoint returns the code point and moves past the pair', () {
    expect(scanner.readCodePoint(), equals(codePoint));
    expectOnFirstLineAt(7);
  });

  test('scanChar with the high surrogate moves into the pair', () {
    expect(scanner.scanChar(highSurrogate), isTrue);
    expectOnFirstLineAt(6);
  });

  test('scanChar with the code point moves past the pair', () {
    expect(scanner.scanChar(codePoint), isTrue);
    expectOnFirstLineAt(7);
  });

  test('expectChar with the high surrogate moves into the pair', () {
    scanner.expectChar(highSurrogate);
    expectOnFirstLineAt(6);
  });

  test('expectChar with the code point moves past the pair', () {
    scanner.expectChar(codePoint);
    expectOnFirstLineAt(7);
  });
});

group('position=', () {
test('forward through newlines sets the line and column', () {
scanner.position = 10; // "foo\nbar\r\nb"
Expand Down
68 changes: 59 additions & 9 deletions test/span_scanner_test.dart
Original file line number Diff line number Diff line change
Expand Up @@ -139,15 +139,6 @@ void testForImplementation(
expect(span.text, equals('o\nbar\nba'));
});

test('.spanFrom() handles surrogate pairs correctly', () {
scanner = create('fo\u{12345}o');
scanner.scan('fo');
final state = scanner.state;
scanner.scan('\u{12345}o');
final span = scanner.spanFrom(state);
expect(span.text, equals('\u{12345}o'));
});

test('.emptySpan returns an empty span at the current location', () {
scanner.scan('foo\nba');

Expand All @@ -164,5 +155,64 @@ void testForImplementation(

expect(span.text, equals(''));
});

group('before a surrogate pair', () {
  // U+1F46D, which UTF-16 encodes as the surrogate pair 0xD83D 0xDC6D.
  final codePoint = '\uD83D\uDC6D'.runes.first;
  const highSurrogate = 0xD83D;

  late SpanScanner scanner;
  setUp(() {
    scanner = create('foo: \uD83D\uDC6D bar');
    expect(scanner.scan('foo: '), isTrue);
  });

  // Asserts that the scanner is still on the first line and that its column
  // and position both equal [offset], counted in UTF-16 code units.
  void expectOnFirstLineAt(int offset) {
    expect(scanner.line, equals(0));
    expect(scanner.column, equals(offset));
    expect(scanner.position, equals(offset));
  }

  test('readChar returns the high surrogate and moves into the pair', () {
    expect(scanner.readChar(), equals(highSurrogate));
    expectOnFirstLineAt(6);
  });

  test('readCodePoint returns the code point and moves past the pair', () {
    expect(scanner.readCodePoint(), equals(codePoint));
    expectOnFirstLineAt(7);
  });

  test('scanChar with the high surrogate moves into the pair', () {
    expect(scanner.scanChar(highSurrogate), isTrue);
    expectOnFirstLineAt(6);
  });

  test('scanChar with the code point moves past the pair', () {
    expect(scanner.scanChar(codePoint), isTrue);
    expectOnFirstLineAt(7);
  });

  test('expectChar with the high surrogate moves into the pair', () {
    scanner.expectChar(highSurrogate);
    expectOnFirstLineAt(6);
  });

  test('expectChar with the code point moves past the pair', () {
    scanner.expectChar(codePoint);
    expectOnFirstLineAt(7);
  });

  test('spanFrom covers the surrogate pair', () {
    final state = scanner.state;
    scanner.scan('\uD83D\uDC6D b');
    expect(scanner.spanFrom(state).text, equals('\uD83D\uDC6D b'));
  });
});
});
}
Loading

0 comments on commit c637deb

Please sign in to comment.