Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UTF-16 support #109

Merged
merged 20 commits into from
Oct 30, 2023
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
e30d741
[utf-16] Create a TextSource trait to be the abstract interface to te…
jfkthame Oct 16, 2023
a9eb4ae
[utf-16] Adapt explicit.rs to use the TextSource trait.
jfkthame Oct 16, 2023
f9affd5
[utf-16] Adapt implicit.rs to use the TextSource trait.
jfkthame Oct 16, 2023
5ba5638
[utf-16] Factor out the processing in InitialInfo to use a TextSource…
jfkthame Oct 16, 2023
29cae27
[utf-16] Refactor BidiInfo to move algorithms into helper functions.
jfkthame Oct 16, 2023
666b5ce
[utf-16] Factor out work from Paragraph into a helper function.
jfkthame Oct 16, 2023
229ec5a
[utf-16] Create a UTF-16 version of InitialInfo.
jfkthame Oct 16, 2023
3a3b1d0
[utf-16] Create a UTF-16 version of BidiInfo.
jfkthame Oct 16, 2023
1f53093
[utf-16] Create a UTF-16 version of Paragraph.
jfkthame Oct 16, 2023
3656603
[utf-16] Duplicate testcases to exercise the UTF-16 APIs.
jfkthame Oct 16, 2023
f04d399
[utf-16] Check the BidiInfoU16 vs BidiInfo results in test_basic_conf…
jfkthame Oct 14, 2023
0bc8c03
[utf-16] Add UTF-16 versions of benches.
jfkthame Oct 11, 2023
6fb622d
[utf-16] Manually implement sealed-trait pattern for TextSource.
jfkthame Oct 29, 2023
005e92e
[utf-16] Explicitly use core::char to resolve compilation failure in …
jfkthame Oct 29, 2023
c3d82dc
[utf-16] Fix tests to be compatible with 1.36.0
jfkthame Oct 30, 2023
74a631e
[utf-16] Don't expose ...U16 types in the base module, just make the …
jfkthame Oct 30, 2023
7866637
[utf-16] Use char_indices() to iterate in identify_bracket_pairs().
jfkthame Oct 30, 2023
69aba46
fixup! [utf-16] Refactor BidiInfo to move algorithms into helper func…
jfkthame Oct 30, 2023
52cc9ee
fixup! [utf-16] Don't expose ...U16 types in the base module, just ma…
jfkthame Oct 30, 2023
ccda52f
[utf-16] Move iterator bounds to the TextSource trait
jfkthame Oct 30, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions benches/basic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@ extern crate unicode_bidi;
use test::Bencher;

use unicode_bidi::BidiInfo;
use unicode_bidi::utf16::BidiInfo as BidiInfoU16;

/// Encodes `s` as UTF-16 and returns the resulting code units as an owned vector.
fn to_utf16(s: &str) -> Vec<u16> {
    let mut units = Vec::new();
    units.extend(s.encode_utf16());
    units
}

const LTR_TEXTS: &[&str] = &["abc\ndef\nghi", "abc 123\ndef 456\nghi 789"];

Expand All @@ -29,6 +34,14 @@ fn bench_bidi_info_new(b: &mut Bencher, texts: &[&str]) {
}
}

/// Benchmarks `BidiInfoU16::new(text, None)` for each UTF-16 text in `texts`.
///
/// `texts` is a slice rather than `&Vec<_>`, matching the `&[&str]` parameter of
/// `bench_bidi_info_new`; callers that pass `&Vec<Vec<u16>>` still compile via
/// deref coercion.
fn bench_bidi_info_new_u16(b: &mut Bencher, texts: &[Vec<u16>]) {
    for text in texts {
        // Return the constructed value from the closure instead of discarding it,
        // so the bench harness receives the result of the measured work.
        b.iter(|| BidiInfoU16::new(text, None));
    }
}

fn bench_reorder_line(b: &mut Bencher, texts: &[&str]) {
for text in texts {
let bidi_info = BidiInfo::new(text, None);
Expand All @@ -41,6 +54,18 @@ fn bench_reorder_line(b: &mut Bencher, texts: &[&str]) {
}
}

/// Benchmarks `reorder_line` over every paragraph of each UTF-16 text in `texts`.
///
/// The `BidiInfoU16` for a text is built once, before the closure handed to
/// `b.iter`, so construction is not part of the repeated work. Each paragraph's
/// full `range` is used as the line to reorder.
///
/// `texts` is a slice rather than `&Vec<_>`, matching the `&[&str]` parameter of
/// `bench_reorder_line`; callers that pass `&Vec<Vec<u16>>` still compile via
/// deref coercion.
fn bench_reorder_line_u16(b: &mut Bencher, texts: &[Vec<u16>]) {
    for text in texts {
        let bidi_info = BidiInfoU16::new(text, None);
        b.iter(|| {
            for para in &bidi_info.paragraphs {
                // `reorder_line` takes the range by value, so clone it per call.
                let line = para.range.clone();
                bidi_info.reorder_line(para, line);
            }
        });
    }
}

#[bench]
fn bench_1_bidi_info_new_for_ltr_texts(b: &mut Bencher) {
bench_bidi_info_new(b, LTR_TEXTS);
Expand All @@ -60,3 +85,27 @@ fn bench_3_reorder_line_for_ltr_texts(b: &mut Bencher) {
fn bench_4_reorder_line_for_bidi_texts(b: &mut Bencher) {
bench_reorder_line(b, BIDI_TEXTS);
}

/// Runs `bench_bidi_info_new_u16` on the UTF-16 encodings of `LTR_TEXTS`.
///
/// The texts are encoded up front, before the helper is invoked.
#[bench]
fn bench_5_bidi_info_new_for_ltr_texts_u16(b: &mut Bencher) {
    let encoded: Vec<Vec<u16>> = LTR_TEXTS.iter().map(|&text| to_utf16(text)).collect();
    bench_bidi_info_new_u16(b, &encoded);
}

/// Runs `bench_bidi_info_new_u16` on the UTF-16 encodings of `BIDI_TEXTS`.
///
/// The texts are encoded up front, before the helper is invoked.
#[bench]
fn bench_6_bidi_info_new_for_bidi_texts_u16(b: &mut Bencher) {
    let encoded: Vec<Vec<u16>> = BIDI_TEXTS.iter().map(|&text| to_utf16(text)).collect();
    bench_bidi_info_new_u16(b, &encoded);
}

/// Runs `bench_reorder_line_u16` on the UTF-16 encodings of `LTR_TEXTS`.
///
/// The texts are encoded up front, before the helper is invoked.
#[bench]
fn bench_7_reorder_line_for_ltr_texts_u16(b: &mut Bencher) {
    let encoded: Vec<Vec<u16>> = LTR_TEXTS.iter().map(|&text| to_utf16(text)).collect();
    bench_reorder_line_u16(b, &encoded);
}

/// Runs `bench_reorder_line_u16` on the UTF-16 encodings of `BIDI_TEXTS`.
///
/// The texts are encoded up front, before the helper is invoked.
#[bench]
fn bench_8_reorder_line_for_bidi_texts_u16(b: &mut Bencher) {
    let encoded: Vec<Vec<u16>> = BIDI_TEXTS.iter().map(|&text| to_utf16(text)).collect();
    bench_reorder_line_u16(b, &encoded);
}
49 changes: 49 additions & 0 deletions benches/udhr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@ extern crate unicode_bidi;
use test::Bencher;

use unicode_bidi::BidiInfo;
use unicode_bidi::utf16::BidiInfo as BidiInfoU16;

/// Encodes `s` as UTF-16 and returns the resulting code units as an owned vector.
fn to_utf16(s: &str) -> Vec<u16> {
    let mut units = Vec::new();
    units.extend(s.encode_utf16());
    units
}

const LTR_TEXTS: &[&str] = &[
include_str!("../data/udhr/ltr/udhr_acu_1.txt"),
Expand Down Expand Up @@ -55,6 +60,14 @@ fn bench_bidi_info_new(b: &mut Bencher, texts: &[&str]) {
}
}

/// Benchmarks `BidiInfoU16::new(text, None)` for each UTF-16 text in `texts`.
///
/// `texts` is a slice rather than `&Vec<_>`, matching the `&[&str]` parameter of
/// `bench_bidi_info_new`; callers that pass `&Vec<Vec<u16>>` still compile via
/// deref coercion.
fn bench_bidi_info_new_u16(b: &mut Bencher, texts: &[Vec<u16>]) {
    for text in texts {
        // Return the constructed value from the closure instead of discarding it,
        // so the bench harness receives the result of the measured work.
        b.iter(|| BidiInfoU16::new(text, None));
    }
}

fn bench_reorder_line(b: &mut Bencher, texts: &[&str]) {
for text in texts {
let bidi_info = BidiInfo::new(text, None);
Expand All @@ -67,6 +80,18 @@ fn bench_reorder_line(b: &mut Bencher, texts: &[&str]) {
}
}

/// Benchmarks `reorder_line` over every paragraph of each UTF-16 text in `texts`.
///
/// The `BidiInfoU16` for a text is built once, before the closure handed to
/// `b.iter`, so construction is not part of the repeated work. Each paragraph's
/// full `range` is used as the line to reorder.
///
/// `texts` is a slice rather than `&Vec<_>`, matching the `&[&str]` parameter of
/// `bench_reorder_line`; callers that pass `&Vec<Vec<u16>>` still compile via
/// deref coercion.
fn bench_reorder_line_u16(b: &mut Bencher, texts: &[Vec<u16>]) {
    for text in texts {
        let bidi_info = BidiInfoU16::new(text, None);
        b.iter(|| {
            for para in &bidi_info.paragraphs {
                // `reorder_line` takes the range by value, so clone it per call.
                let line = para.range.clone();
                bidi_info.reorder_line(para, line);
            }
        });
    }
}

#[bench]
fn bench_1_bidi_info_new_for_ltr_texts(b: &mut Bencher) {
bench_bidi_info_new(b, LTR_TEXTS);
Expand All @@ -86,3 +111,27 @@ fn bench_3_reorder_line_for_ltr_texts(b: &mut Bencher) {
fn bench_4_reorder_line_for_bidi_texts(b: &mut Bencher) {
bench_reorder_line(b, BIDI_TEXTS);
}

/// Runs `bench_bidi_info_new_u16` on the UTF-16 encodings of `LTR_TEXTS`.
///
/// The texts are encoded up front, before the helper is invoked.
#[bench]
fn bench_5_bidi_info_new_for_ltr_texts_u16(b: &mut Bencher) {
    let encoded: Vec<Vec<u16>> = LTR_TEXTS.iter().map(|&text| to_utf16(text)).collect();
    bench_bidi_info_new_u16(b, &encoded);
}

/// Runs `bench_bidi_info_new_u16` on the UTF-16 encodings of `BIDI_TEXTS`.
///
/// The texts are encoded up front, before the helper is invoked.
#[bench]
fn bench_6_bidi_info_new_for_bidi_texts_u16(b: &mut Bencher) {
    let encoded: Vec<Vec<u16>> = BIDI_TEXTS.iter().map(|&text| to_utf16(text)).collect();
    bench_bidi_info_new_u16(b, &encoded);
}

/// Runs `bench_reorder_line_u16` on the UTF-16 encodings of `LTR_TEXTS`.
///
/// The texts are encoded up front, before the helper is invoked.
#[bench]
fn bench_7_reorder_line_for_ltr_texts_u16(b: &mut Bencher) {
    let encoded: Vec<Vec<u16>> = LTR_TEXTS.iter().map(|&text| to_utf16(text)).collect();
    bench_reorder_line_u16(b, &encoded);
}

/// Runs `bench_reorder_line_u16` on the UTF-16 encodings of `BIDI_TEXTS`.
///
/// The texts are encoded up front, before the helper is invoked.
#[bench]
fn bench_8_reorder_line_for_bidi_texts_u16(b: &mut Bencher) {
    let encoded: Vec<Vec<u16>> = BIDI_TEXTS.iter().map(|&text| to_utf16(text)).collect();
    bench_reorder_line_u16(b, &encoded);
}
13 changes: 8 additions & 5 deletions src/explicit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,22 @@ use super::char_data::{
BidiClass::{self, *},
};
use super::level::Level;
use super::TextSource;

/// Compute explicit embedding levels for one paragraph of text (X1-X8).
///
/// `processing_classes[i]` must contain the `BidiClass` of the char at byte index `i`,
/// for each char in `text`.
#[cfg_attr(feature = "flame_it", flamer::flame)]
pub fn compute(
text: &str,
pub fn compute<'a, T: TextSource<'a> + ?Sized>(
text: &'a T,
para_level: Level,
original_classes: &[BidiClass],
levels: &mut [Level],
processing_classes: &mut [BidiClass],
) {
) where
<T as TextSource<'a>>::IndexLenIter: Iterator<Item = (usize, usize)>,
{
assert_eq!(text.len(), original_classes.len());

// <http://www.unicode.org/reports/tr9/#X1>
Expand All @@ -41,7 +44,7 @@ pub fn compute(
let mut overflow_embedding_count = 0u32;
let mut valid_isolate_count = 0u32;

for (i, c) in text.char_indices() {
for (i, len) in text.indices_lengths() {
match original_classes[i] {
// Rules X2-X5c
RLE | LRE | RLO | LRO | RLI | LRI | FSI => {
Expand Down Expand Up @@ -167,7 +170,7 @@ pub fn compute(
}

// Handle multi-byte characters.
for j in 1..c.len_utf8() {
for j in 1..len {
levels[i + j] = levels[i];
processing_classes[i + j] = processing_classes[i];
}
Expand Down
65 changes: 30 additions & 35 deletions src/implicit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,14 @@ use core::cmp::max;
use super::char_data::BidiClass::{self, *};
use super::level::Level;
use super::prepare::{not_removed_by_x9, IsolatingRunSequence};
use super::BidiDataSource;
use super::{BidiDataSource, TextSource};

/// 3.3.4 Resolving Weak Types
///
/// <http://www.unicode.org/reports/tr9/#Resolving_Weak_Types>
#[cfg_attr(feature = "flame_it", flamer::flame)]
pub fn resolve_weak(
text: &str,
pub fn resolve_weak<'a, T: TextSource<'a> + ?Sized>(
text: &'a T,
sequence: &IsolatingRunSequence,
processing_classes: &mut [BidiClass],
) {
Expand Down Expand Up @@ -120,9 +120,9 @@ pub fn resolve_weak(
// See https://github.com/servo/unicode-bidi/issues/86 for improving this.
// We want to make sure we check the correct next character by skipping past the rest
// of this one.
if let Some(ch) = text.get(i..).and_then(|s| s.chars().next()) {
if let Some((_, char_len)) = text.char_at(i) {
let mut next_class = sequence
.iter_forwards_from(i + ch.len_utf8(), run_index)
.iter_forwards_from(i + char_len, run_index)
.map(|j| processing_classes[j])
// <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
.find(not_removed_by_x9)
Expand Down Expand Up @@ -156,7 +156,7 @@ pub fn resolve_weak(
}
*class = ON;
}
for idx in sequence.iter_forwards_from(i + ch.len_utf8(), run_index) {
for idx in sequence.iter_forwards_from(i + char_len, run_index) {
let class = &mut processing_classes[idx];
if *class != BN {
break;
Expand Down Expand Up @@ -248,14 +248,17 @@ pub fn resolve_weak(
///
/// <http://www.unicode.org/reports/tr9/#Resolving_Neutral_Types>
#[cfg_attr(feature = "flame_it", flamer::flame)]
pub fn resolve_neutral<D: BidiDataSource>(
text: &str,
pub fn resolve_neutral<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
text: &'a T,
data_source: &D,
sequence: &IsolatingRunSequence,
levels: &[Level],
original_classes: &[BidiClass],
processing_classes: &mut [BidiClass],
) {
) where
<T as TextSource<'a>>::CharIndexIter: Iterator<Item = (usize, char)>,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: all of these bounds should live on the TextSource trait, not on all these where clauses

<T as TextSource<'a>>::CharIter: Iterator<Item = char>,
{
// e = embedding direction
let e: BidiClass = levels[sequence.runs[0].start].bidi_class();
let not_e = if e == BidiClass::L {
Expand Down Expand Up @@ -288,12 +291,13 @@ pub fn resolve_neutral<D: BidiDataSource>(
let mut found_not_e = false;
let mut class_to_set = None;

let start_len_utf8 = text[pair.start..].chars().next().unwrap().len_utf8();
let start_char_len =
T::char_len(text.subrange(pair.start..pair.end).chars().next().unwrap());
// > Inspect the bidirectional types of the characters enclosed within the bracket pair.
//
// `pair` is [start, end) so we will end up processing the opening character but not the closing one.
//
for enclosed_i in sequence.iter_forwards_from(pair.start + start_len_utf8, pair.start_run) {
for enclosed_i in sequence.iter_forwards_from(pair.start + start_char_len, pair.start_run) {
if enclosed_i >= pair.end {
#[cfg(feature = "std")]
debug_assert!(
Expand Down Expand Up @@ -362,11 +366,12 @@ pub fn resolve_neutral<D: BidiDataSource>(
if let Some(class_to_set) = class_to_set {
// Update all processing classes corresponding to the start and end elements, as requested.
// We should include all bytes of the character, not the first one.
let end_len_utf8 = text[pair.end..].chars().next().unwrap().len_utf8();
for class in &mut processing_classes[pair.start..pair.start + start_len_utf8] {
let end_char_len =
T::char_len(text.subrange(pair.end..text.len()).chars().next().unwrap());
for class in &mut processing_classes[pair.start..pair.start + start_char_len] {
*class = class_to_set;
}
for class in &mut processing_classes[pair.end..pair.end + end_len_utf8] {
for class in &mut processing_classes[pair.end..pair.end + end_char_len] {
*class = class_to_set;
}
// <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
Expand All @@ -382,7 +387,7 @@ pub fn resolve_neutral<D: BidiDataSource>(

// This rule deals with sequences of NSMs, so we can just update them all at once, we don't need to worry
// about character boundaries. We do need to be careful to skip the full set of bytes for the parentheses characters.
let nsm_start = pair.start + start_len_utf8;
let nsm_start = pair.start + start_char_len;
for idx in sequence.iter_forwards_from(nsm_start, pair.start_run) {
let class = original_classes[idx];
if class == BidiClass::NSM || processing_classes[idx] == BN {
Expand All @@ -391,7 +396,7 @@ pub fn resolve_neutral<D: BidiDataSource>(
break;
}
}
let nsm_end = pair.end + end_len_utf8;
let nsm_end = pair.end + end_char_len;
for idx in sequence.iter_forwards_from(nsm_end, pair.end_run) {
let class = original_classes[idx];
if class == BidiClass::NSM || processing_classes[idx] == BN {
Expand Down Expand Up @@ -477,37 +482,27 @@ struct BracketPair {
/// text source.
///
/// <https://www.unicode.org/reports/tr9/#BD16>
fn identify_bracket_pairs<D: BidiDataSource>(
text: &str,
fn identify_bracket_pairs<'a, T: TextSource<'a> + ?Sized, D: BidiDataSource>(
text: &'a T,
data_source: &D,
run_sequence: &IsolatingRunSequence,
original_classes: &[BidiClass],
) -> Vec<BracketPair> {
) -> Vec<BracketPair>
where
<T as TextSource<'a>>::CharIndexIter: Iterator<Item = (usize, char)>,
{
let mut ret = vec![];
let mut stack = vec![];

for (run_index, level_run) in run_sequence.runs.iter().enumerate() {
let slice = if let Some(slice) = text.get(level_run.clone()) {
slice
} else {
#[cfg(feature = "std")]
std::debug_assert!(
false,
"Found broken indices in level run: found indices {}..{} for string of length {}",
level_run.start,
level_run.end,
text.len()
);
return ret;
};

for (i, ch) in slice.char_indices() {
for (i, ch) in text.subrange(level_run.clone()).char_indices() {
let actual_index = level_run.start + i;

// All paren characters are ON.
// From BidiBrackets.txt:
// > The Unicode property value stability policy guarantees that characters
// > which have bpt=o or bpt=c also have bc=ON and Bidi_M=Y
if original_classes[level_run.start + i] != BidiClass::ON {
if original_classes[actual_index] != BidiClass::ON {
continue;
}

Expand Down
Loading
Loading