Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve mixed CJK/Latin linebreaking. #1986

Merged
merged 1 commit into from
Sep 6, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 62 additions & 18 deletions crates/epaint/src/text/text_layout.rs
Original file line number Diff line number Diff line change
Expand Up @@ -719,11 +719,11 @@ struct RowBreakCandidates {
/// is always the primary candidate.
space: Option<usize>,

/// Logograms (single character representing a whole word) are good candidates for line break.
logogram: Option<usize>,
/// Logograms (single character representing a whole word) or kana (Japanese hiragana and katakana) are good candidates for line break.
cjk: Option<usize>,

/// Kana (Japanese hiragana and katakana) may be line broken unless before a gyōtō kinsoku character.
kana: Option<usize>,
/// Breaking anywhere before a CJK character is acceptable too.
pre_cjk: Option<usize>,

/// Breaking at a dash is a super-
/// good idea.
Expand All @@ -744,37 +744,38 @@ impl RowBreakCandidates {
const NON_BREAKING_SPACE: char = '\u{A0}';
if chr.is_whitespace() && chr != NON_BREAKING_SPACE {
self.space = Some(index);
} else if is_cjk_ideograph(chr) {
self.logogram = Some(index);
} else if is_cjk(chr) && (glyphs.len() == 1 || is_cjk_break_allowed(glyphs[1].chr)) {
self.cjk = Some(index);
} else if chr == '-' {
self.dash = Some(index);
} else if chr.is_ascii_punctuation() {
self.punctuation = Some(index);
} else if is_kana(chr) && (glyphs.len() == 1 || !is_gyoto_kinsoku(glyphs[1].chr)) {
self.kana = Some(index);
} else if glyphs.len() > 1 && is_cjk(glyphs[1].chr) {
self.pre_cjk = Some(index);
}
self.any = Some(index);
}

fn has_word_boundary(&self) -> bool {
self.space.is_some() || self.logogram.is_some()
fn word_boundary(&self) -> Option<usize> {
[self.space, self.cjk, self.pre_cjk]
.into_iter()
.max()
.flatten()
}

fn has_good_candidate(&self, break_anywhere: bool) -> bool {
if break_anywhere {
self.any.is_some()
} else {
self.has_word_boundary()
self.word_boundary().is_some()
}
}

fn get(&self, break_anywhere: bool) -> Option<usize> {
if break_anywhere {
self.any
} else {
self.space
.or(self.kana)
.or(self.logogram)
self.word_boundary()
.or(self.dash)
.or(self.punctuation)
.or(self.any)
Expand All @@ -796,10 +797,15 @@ fn is_kana(c: char) -> bool {
}

#[inline]
fn is_gyoto_kinsoku(c: char) -> bool {
// Gyōtō (meaning "beginning of line") kinsoku characters in Japanese typesetting are characters that may not appear at the start of a line, according to kinsoku shori rules.
// The list of gyōtō kinsoku characters can be found at https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages#Characters_not_permitted_on_the_start_of_a_line.
")]}〕〉》」』】〙〗〟'\"⦆»ヽヾーァィゥェォッャュョヮヵヶぁぃぅぇぉっゃゅょゎゕゖㇰㇱㇲㇳㇴㇵㇶㇷㇸㇹㇺㇻㇼㇽㇾㇿ々〻‐゠–〜?!‼⁇⁈⁉・、:;,。.".contains(c)
fn is_cjk(c: char) -> bool {
// TODO: Add support for Korean Hangul.
is_cjk_ideograph(c) || is_kana(c)
}

#[inline]
fn is_cjk_break_allowed(c: char) -> bool {
// See: https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages#Characters_not_permitted_on_the_start_of_a_line.
!")]}〕〉》」』】〙〗〟'\"⦆»ヽヾーァィゥェォッャュョヮヵヶぁぃぅぇぉっゃゅょゎゕゖㇰㇱㇲㇳㇴㇵㇶㇷㇸㇹㇺㇻㇼㇽㇾㇿ々〻‐゠–〜?!‼⁇⁈⁉・、:;,。.".contains(c)
}

// ----------------------------------------------------------------------------
Expand All @@ -812,3 +818,41 @@ fn test_zero_max_width() {
let galley = super::layout(&mut fonts, layout_job.into());
assert_eq!(galley.rows.len(), 1);
}

#[test]
fn test_cjk() {
let mut fonts = FontsImpl::new(1.0, 1024, super::FontDefinitions::default());
let mut layout_job = LayoutJob::single_section(
"日本語とEnglishの混在した文章".into(),
super::TextFormat::default(),
);
layout_job.wrap.max_width = 90.0;
let galley = super::layout(&mut fonts, layout_job.into());
assert_eq!(
galley
.rows
.iter()
.map(|row| row.glyphs.iter().map(|g| g.chr).collect::<String>())
.collect::<Vec<_>>(),
vec!["日本語と", "Englishの混在", "した文章"]
);
}

#[test]
fn test_pre_cjk() {
let mut fonts = FontsImpl::new(1.0, 1024, super::FontDefinitions::default());
let mut layout_job = LayoutJob::single_section(
"日本語とEnglishの混在した文章".into(),
super::TextFormat::default(),
);
layout_job.wrap.max_width = 100.0;
let galley = super::layout(&mut fonts, layout_job.into());
assert_eq!(
galley
.rows
.iter()
.map(|row| row.glyphs.iter().map(|g| g.chr).collect::<String>())
.collect::<Vec<_>>(),
vec!["日本語とEnglish", "の混在した文章"]
);
}