Skip to content

Commit a01990c

Browse files
wada314topecongiro
authored andcommitted
Use Unicode-standard char width to wrap comments or strings. (rust-lang#3275)
1 parent 503cdde commit a01990c

11 files changed

+136
-59
lines changed

Diff for: Cargo.lock

+8
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Diff for: Cargo.toml

+2
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@ rustc-ap-syntax = "306.0.0"
5353
rustc-ap-syntax_pos = "306.0.0"
5454
failure = "0.1.1"
5555
bytecount = "0.4"
56+
unicode-width = "0.1.5"
57+
unicode_categories = "0.1.1"
5658

5759
# A noop dependency that changes in the Rust repository, it's a bit of a hack.
5860
# See the `src/tools/rustc-workspace-hack/README.md` file in `rust-lang/rust`

Diff for: src/comment.rs

+23-17
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@ use config::Config;
1919
use rewrite::RewriteContext;
2020
use shape::{Indent, Shape};
2121
use string::{rewrite_string, StringFormat};
22-
use utils::{count_newlines, first_line_width, last_line_width, trim_left_preserve_layout};
22+
use utils::{
23+
count_newlines, first_line_width, last_line_width, trim_left_preserve_layout, unicode_str_width,
24+
};
2325
use {ErrorKind, FormattingError};
2426

2527
fn is_custom_comment(comment: &str) -> bool {
@@ -264,7 +266,8 @@ fn identify_comment(
264266
) -> Option<String> {
265267
let style = comment_style(orig, false);
266268

267-
// Computes the len of line taking into account a newline if the line is part of a paragraph.
269+
// Computes the byte length of line taking into account a newline if the line is part of a
270+
// paragraph.
268271
fn compute_len(orig: &str, line: &str) -> usize {
269272
if orig.len() > line.len() {
270273
if orig.as_bytes()[line.len()] == b'\r' {
@@ -498,7 +501,7 @@ struct CommentRewrite<'a> {
498501
item_block: Option<ItemizedBlock>,
499502
comment_line_separator: String,
500503
indent_str: String,
501-
max_chars: usize,
504+
max_width: usize,
502505
fmt_indent: Indent,
503506
fmt: StringFormat<'a>,
504507

@@ -520,7 +523,7 @@ impl<'a> CommentRewrite<'a> {
520523
comment_style(orig, config.normalize_comments()).to_str_tuplet()
521524
};
522525

523-
let max_chars = shape
526+
let max_width = shape
524527
.width
525528
.checked_sub(closer.len() + opener.len())
526529
.unwrap_or(1);
@@ -534,7 +537,7 @@ impl<'a> CommentRewrite<'a> {
534537
code_block_attr: None,
535538
item_block: None,
536539
comment_line_separator: format!("{}{}", indent_str, line_start),
537-
max_chars,
540+
max_width,
538541
indent_str,
539542
fmt_indent,
540543

@@ -543,7 +546,7 @@ impl<'a> CommentRewrite<'a> {
543546
closer: "",
544547
line_start,
545548
line_end: "",
546-
shape: Shape::legacy(max_chars, fmt_indent),
549+
shape: Shape::legacy(max_width, fmt_indent),
547550
trim_end: true,
548551
config,
549552
},
@@ -583,14 +586,14 @@ impl<'a> CommentRewrite<'a> {
583586

584587
if let Some(ref ib) = self.item_block {
585588
// the last few lines are part of an itemized block
586-
self.fmt.shape = Shape::legacy(self.max_chars, self.fmt_indent);
589+
self.fmt.shape = Shape::legacy(self.max_width, self.fmt_indent);
587590
let item_fmt = ib.create_string_format(&self.fmt);
588591
self.result.push_str(&self.comment_line_separator);
589592
self.result.push_str(&ib.opener);
590593
match rewrite_string(
591594
&ib.trimmed_block_as_string(),
592595
&item_fmt,
593-
self.max_chars.saturating_sub(ib.indent),
596+
self.max_width.saturating_sub(ib.indent),
594597
) {
595598
Some(s) => self.result.push_str(&Self::join_block(
596599
&s,
@@ -626,14 +629,14 @@ impl<'a> CommentRewrite<'a> {
626629
return false;
627630
}
628631
self.is_prev_line_multi_line = false;
629-
self.fmt.shape = Shape::legacy(self.max_chars, self.fmt_indent);
632+
self.fmt.shape = Shape::legacy(self.max_width, self.fmt_indent);
630633
let item_fmt = ib.create_string_format(&self.fmt);
631634
self.result.push_str(&self.comment_line_separator);
632635
self.result.push_str(&ib.opener);
633636
match rewrite_string(
634637
&ib.trimmed_block_as_string(),
635638
&item_fmt,
636-
self.max_chars.saturating_sub(ib.indent),
639+
self.max_width.saturating_sub(ib.indent),
637640
) {
638641
Some(s) => self.result.push_str(&Self::join_block(
639642
&s,
@@ -710,8 +713,11 @@ impl<'a> CommentRewrite<'a> {
710713
}
711714
}
712715

713-
if self.fmt.config.wrap_comments() && line.len() > self.fmt.shape.width && !has_url(line) {
714-
match rewrite_string(line, &self.fmt, self.max_chars) {
716+
if self.fmt.config.wrap_comments()
717+
&& unicode_str_width(line) > self.fmt.shape.width
718+
&& !has_url(line)
719+
{
720+
match rewrite_string(line, &self.fmt, self.max_width) {
715721
Some(ref s) => {
716722
self.is_prev_line_multi_line = s.contains('\n');
717723
self.result.push_str(s);
@@ -721,8 +727,8 @@ impl<'a> CommentRewrite<'a> {
721727
// Remove the trailing space, then start rewrite on the next line.
722728
self.result.pop();
723729
self.result.push_str(&self.comment_line_separator);
724-
self.fmt.shape = Shape::legacy(self.max_chars, self.fmt_indent);
725-
match rewrite_string(line, &self.fmt, self.max_chars) {
730+
self.fmt.shape = Shape::legacy(self.max_width, self.fmt_indent);
731+
match rewrite_string(line, &self.fmt, self.max_width) {
726732
Some(ref s) => {
727733
self.is_prev_line_multi_line = s.contains('\n');
728734
self.result.push_str(s);
@@ -743,20 +749,20 @@ impl<'a> CommentRewrite<'a> {
743749
// 1 = " "
744750
let offset = 1 + last_line_width(&self.result) - self.line_start.len();
745751
Shape {
746-
width: self.max_chars.saturating_sub(offset),
752+
width: self.max_width.saturating_sub(offset),
747753
indent: self.fmt_indent,
748754
offset: self.fmt.shape.offset + offset,
749755
}
750756
} else {
751-
Shape::legacy(self.max_chars, self.fmt_indent)
757+
Shape::legacy(self.max_width, self.fmt_indent)
752758
};
753759
} else {
754760
if line.is_empty() && self.result.ends_with(' ') && !is_last {
755761
// Remove space if this is an empty comment or a doc comment.
756762
self.result.pop();
757763
}
758764
self.result.push_str(line);
759-
self.fmt.shape = Shape::legacy(self.max_chars, self.fmt_indent);
765+
self.fmt.shape = Shape::legacy(self.max_width, self.fmt_indent);
760766
self.is_prev_line_multi_line = false;
761767
}
762768

Diff for: src/lib.rs

+2
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,9 @@ extern crate serde_json;
2929
extern crate syntax;
3030
extern crate syntax_pos;
3131
extern crate toml;
32+
extern crate unicode_categories;
3233
extern crate unicode_segmentation;
34+
extern crate unicode_width;
3335

3436
use std::cell::RefCell;
3537
use std::collections::HashMap;

Diff for: src/overflow.rs

+2-1
Original file line numberDiff line numberDiff line change
@@ -431,7 +431,8 @@ impl<'a> Context<'a> {
431431
};
432432

433433
if let Some(rewrite) = rewrite {
434-
let rewrite_first_line = Some(rewrite[..first_line_width(&rewrite)].to_owned());
434+
// splitn(2, *).next().unwrap() is always safe.
435+
let rewrite_first_line = Some(rewrite.splitn(2, '\n').next().unwrap().to_owned());
435436
last_list_item.item = rewrite_first_line;
436437
Some(rewrite)
437438
} else {

Diff for: src/string.rs

+44-26
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,12 @@
1111
// Format string literals.
1212

1313
use regex::Regex;
14+
use unicode_categories::UnicodeCategories;
1415
use unicode_segmentation::UnicodeSegmentation;
1516

1617
use config::Config;
1718
use shape::Shape;
18-
use utils::wrap_str;
19+
use utils::{unicode_str_width, wrap_str};
1920

2021
const MIN_STRING: usize = 10;
2122

@@ -53,7 +54,7 @@ impl<'a> StringFormat<'a> {
5354
/// indentation into account.
5455
///
5556
/// If we cannot put at least a single character per line, the rewrite won't succeed.
56-
fn max_chars_with_indent(&self) -> Option<usize> {
57+
fn max_width_with_indent(&self) -> Option<usize> {
5758
Some(
5859
self.shape
5960
.width
@@ -62,10 +63,10 @@ impl<'a> StringFormat<'a> {
6263
)
6364
}
6465

65-
/// Like max_chars_with_indent but the indentation is not subtracted.
66+
/// Like max_width_with_indent but the indentation is not subtracted.
6667
/// This allows to fit more graphemes from the string on a line when
6768
/// SnippetState::EndWithLineFeed.
68-
fn max_chars_without_indent(&self) -> Option<usize> {
69+
fn max_width_without_indent(&self) -> Option<usize> {
6970
Some(self.config.max_width().checked_sub(self.line_end.len())?)
7071
}
7172
}
@@ -75,8 +76,8 @@ pub fn rewrite_string<'a>(
7576
fmt: &StringFormat<'a>,
7677
newline_max_chars: usize,
7778
) -> Option<String> {
78-
let max_chars_with_indent = fmt.max_chars_with_indent()?;
79-
let max_chars_without_indent = fmt.max_chars_without_indent()?;
79+
let max_width_with_indent = fmt.max_width_with_indent()?;
80+
let max_width_without_indent = fmt.max_width_without_indent()?;
8081
let indent_with_newline = fmt.shape.indent.to_string_with_newline(fmt.config);
8182
let indent_without_newline = fmt.shape.indent.to_string(fmt.config);
8283

@@ -99,11 +100,11 @@ pub fn rewrite_string<'a>(
99100

100101
// Snip a line at a time from `stripped_str` until it is used up. Push the snippet
101102
// onto result.
102-
let mut cur_max_chars = max_chars_with_indent;
103+
let mut cur_max_width = max_width_with_indent;
103104
let is_bareline_ok = fmt.line_start.is_empty() || is_whitespace(fmt.line_start);
104105
loop {
105106
// All the input starting at cur_start fits on the current line
106-
if graphemes.len() - cur_start <= cur_max_chars {
107+
if graphemes_width(&graphemes[cur_start..]) <= cur_max_width {
107108
for (i, grapheme) in graphemes[cur_start..].iter().enumerate() {
108109
if is_new_line(grapheme) {
109110
// take care of blank lines
@@ -123,7 +124,7 @@ pub fn rewrite_string<'a>(
123124

124125
// The input starting at cur_start needs to be broken
125126
match break_string(
126-
cur_max_chars,
127+
cur_max_width,
127128
fmt.trim_end,
128129
fmt.line_end,
129130
&graphemes[cur_start..],
@@ -133,7 +134,7 @@ pub fn rewrite_string<'a>(
133134
result.push_str(fmt.line_end);
134135
result.push_str(&indent_with_newline);
135136
result.push_str(fmt.line_start);
136-
cur_max_chars = newline_max_chars;
137+
cur_max_width = newline_max_chars;
137138
cur_start += len;
138139
}
139140
SnippetState::EndWithLineFeed(line, len) => {
@@ -143,11 +144,11 @@ pub fn rewrite_string<'a>(
143144
result.push_str(&line);
144145
if is_bareline_ok {
145146
// the next line can benefit from the full width
146-
cur_max_chars = max_chars_without_indent;
147+
cur_max_width = max_width_without_indent;
147148
} else {
148149
result.push_str(&indent_without_newline);
149150
result.push_str(fmt.line_start);
150-
cur_max_chars = max_chars_with_indent;
151+
cur_max_width = max_width_with_indent;
151152
}
152153
cur_start += len;
153154
}
@@ -226,9 +227,10 @@ fn not_whitespace_except_line_feed(g: &str) -> bool {
226227
is_new_line(g) || !is_whitespace(g)
227228
}
228229

229-
/// Break the input string at a boundary character around the offset `max_chars`. A boundary
230+
/// Break the input string at a boundary character around the offset `max_width`. A boundary
230231
/// character is either a punctuation or a whitespace.
231-
fn break_string(max_chars: usize, trim_end: bool, line_end: &str, input: &[&str]) -> SnippetState {
232+
/// FIXME(issue#3281): We must follow UAX#14 algorithm instead of this.
233+
fn break_string(max_width: usize, trim_end: bool, line_end: &str, input: &[&str]) -> SnippetState {
232234
let break_at = |index /* grapheme at index is included */| {
233235
// Take in any whitespaces to the left/right of `input[index]` while
234236
// preserving line feeds
@@ -272,19 +274,33 @@ fn break_string(max_chars: usize, trim_end: bool, line_end: &str, input: &[&str]
272274
}
273275
};
274276

277+
// find a first index where the unicode width of input[0..x] become > max_width
278+
let max_width_index_in_input = {
279+
let mut cur_width = 0;
280+
let mut cur_index = 0;
281+
for (i, grapheme) in input.iter().enumerate() {
282+
cur_width += unicode_str_width(grapheme);
283+
cur_index = i;
284+
if cur_width > max_width {
285+
break;
286+
}
287+
}
288+
cur_index
289+
};
290+
275291
// Find the position in input for breaking the string
276292
if line_end.is_empty()
277293
&& trim_end
278-
&& !is_whitespace(input[max_chars - 1])
279-
&& is_whitespace(input[max_chars])
294+
&& !is_whitespace(input[max_width_index_in_input - 1])
295+
&& is_whitespace(input[max_width_index_in_input])
280296
{
281297
// At a breaking point already
282298
// The line won't invalidate the rewriting because:
283299
// - no extra space needed for the line_end character
284300
// - extra whitespaces to the right can be trimmed
285-
return break_at(max_chars - 1);
301+
return break_at(max_width_index_in_input - 1);
286302
}
287-
if let Some(url_index_end) = detect_url(input, max_chars) {
303+
if let Some(url_index_end) = detect_url(input, max_width_index_in_input) {
288304
let index_plus_ws = url_index_end
289305
+ input[url_index_end..]
290306
.iter()
@@ -297,27 +313,28 @@ fn break_string(max_chars: usize, trim_end: bool, line_end: &str, input: &[&str]
297313
return SnippetState::LineEnd(input[..=index_plus_ws].concat(), index_plus_ws + 1);
298314
};
299315
}
300-
match input[0..max_chars]
316+
317+
match input[0..max_width_index_in_input]
301318
.iter()
302319
.rposition(|grapheme| is_whitespace(grapheme))
303320
{
304321
// Found a whitespace and what is on its left side is big enough.
305322
Some(index) if index >= MIN_STRING => break_at(index),
306323
// No whitespace found, try looking for a punctuation instead
307-
_ => match input[0..max_chars]
324+
_ => match input[0..max_width_index_in_input]
308325
.iter()
309326
.rposition(|grapheme| is_punctuation(grapheme))
310327
{
311328
// Found a punctuation and what is on its left side is big enough.
312329
Some(index) if index >= MIN_STRING => break_at(index),
313330
// Either no boundary character was found to the left of `input[max_chars]`, or the line
314331
// got too small. We try searching for a boundary character to the right.
315-
_ => match input[max_chars..]
332+
_ => match input[max_width_index_in_input..]
316333
.iter()
317334
.position(|grapheme| is_whitespace(grapheme) || is_punctuation(grapheme))
318335
{
319336
// A boundary was found after the line limit
320-
Some(index) => break_at(max_chars + index),
337+
Some(index) => break_at(max_width_index_in_input + index),
321338
// No boundary to the right, the input cannot be broken
322339
None => SnippetState::EndOfInput(input.concat()),
323340
},
@@ -335,10 +352,11 @@ fn is_whitespace(grapheme: &str) -> bool {
335352
}
336353

337354
fn is_punctuation(grapheme: &str) -> bool {
338-
match grapheme.as_bytes()[0] {
339-
b':' | b',' | b';' | b'.' => true,
340-
_ => false,
341-
}
355+
grapheme.chars().all(|c| c.is_punctuation_other())
356+
}
357+
358+
fn graphemes_width(graphemes: &[&str]) -> usize {
359+
graphemes.iter().map(|s| unicode_str_width(s)).sum()
342360
}
343361

344362
#[cfg(test)]

0 commit comments

Comments
 (0)