From f4201ef2cb89c28bcdb82cab292720eb66b75695 Mon Sep 17 00:00:00 2001 From: Lukasz Anforowicz Date: Mon, 10 Oct 2022 16:33:31 +0000 Subject: [PATCH] Handling of numbered markdown lists. Fixes issue #5416 --- src/comment.rs | 174 +++++++++++++++++++++--- tests/source/itemized-blocks/no_wrap.rs | 36 ++++- tests/source/itemized-blocks/wrap.rs | 36 ++++- tests/target/itemized-blocks/no_wrap.rs | 36 ++++- tests/target/itemized-blocks/wrap.rs | 62 ++++++++- 5 files changed, 319 insertions(+), 25 deletions(-) diff --git a/src/comment.rs b/src/comment.rs index bc0e8774f4949..85918ecc11645 100644 --- a/src/comment.rs +++ b/src/comment.rs @@ -432,12 +432,18 @@ impl CodeBlockAttribute { /// Block that is formatted as an item. /// -/// An item starts with either a star `*` a dash `-` a greater-than `>` or a plus '+'. +/// An item starts with either a star `*`, a dash `-`, a greater-than `>`, a plus '+', or a number +/// `12.` or `34)` (with at most 2 digits). An item represents CommonMark's ["list +/// items"](https://spec.commonmark.org/0.30/#list-items) and/or ["block +/// quotes"](https://spec.commonmark.org/0.30/#block-quotes), but note that only a subset of +/// CommonMark is recognized - see the doc comment of [`ItemizedBlock::get_marker_length`] for more +/// details. +/// /// Different level of indentation are handled by shrinking the shape accordingly. struct ItemizedBlock { /// the lines that are identified as part of an itemized block lines: Vec, - /// the number of characters (typically whitespaces) up to the item sigil + /// the number of characters (typically whitespaces) up to the item marker indent: usize, /// the string that marks the start of an item opener: String, @@ -446,37 +452,70 @@ struct ItemizedBlock { } impl ItemizedBlock { - /// Returns `true` if the line is formatted as an item - fn is_itemized_line(line: &str) -> bool { - let trimmed = line.trim_start(); + /// Checks whether the `trimmed` line includes an item marker. Returns `None` if there is no + /// marker. Returns the length of the marker (in bytes) if one is present. Note that the length + /// includes the whitespace that follows the marker, for example the marker in `"* list item"` + /// has the length of 2. + /// + /// This function recognizes item markers that correspond to CommonMark's + /// ["bullet list marker"](https://spec.commonmark.org/0.30/#bullet-list-marker), + /// ["block quote marker"](https://spec.commonmark.org/0.30/#block-quote-marker), and/or + /// ["ordered list marker"](https://spec.commonmark.org/0.30/#ordered-list-marker). + /// + /// Compared to CommonMark specification, the number of digits that are allowed in an ["ordered + /// list marker"](https://spec.commonmark.org/0.30/#ordered-list-marker) is more limited (to at + /// most 2 digits). Limiting the length of the marker helps reduce the risk of recognizing + /// arbitrary numbers as markers. See also + /// which gives the + /// following example where a number (i.e. "1868") doesn't signify an ordered list: + /// ```md + /// The Captain died in + /// 1868. He wes buried in... + /// ``` + fn get_marker_length(trimmed: &str) -> Option { + // https://spec.commonmark.org/0.30/#bullet-list-marker or + // https://spec.commonmark.org/0.30/#block-quote-marker let itemized_start = ["* ", "- ", "> ", "+ "]; - itemized_start.iter().any(|s| trimmed.starts_with(s)) + if itemized_start.iter().any(|s| trimmed.starts_with(s)) { + return Some(2); // All items in `itemized_start` have length 2. + } + + // https://spec.commonmark.org/0.30/#ordered-list-marker, where at most 2 digits are + // allowed. + for suffix in [". ", ") "] { + if let Some((prefix, _)) = trimmed.split_once(suffix) { + if prefix.len() <= 2 && prefix.chars().all(|c| char::is_ascii_digit(&c)) { + return Some(prefix.len() + suffix.len()); + } + } + } + + None // No markers found. } - /// Creates a new ItemizedBlock described with the given line. - /// The `is_itemized_line` needs to be called first. - fn new(line: &str) -> ItemizedBlock { - let space_to_sigil = line.chars().take_while(|c| c.is_whitespace()).count(); - // +2 = '* ', which will add the appropriate amount of whitespace to keep itemized - // content formatted correctly. - let mut indent = space_to_sigil + 2; + /// Creates a new `ItemizedBlock` described with the given `line`. + /// Returns `None` if `line` doesn't start an item. + fn new(line: &str) -> Option { + let marker_length = ItemizedBlock::get_marker_length(line.trim_start())?; + let space_to_marker = line.chars().take_while(|c| c.is_whitespace()).count(); + let mut indent = space_to_marker + marker_length; let mut line_start = " ".repeat(indent); // Markdown blockquote start with a "> " if line.trim_start().starts_with(">") { // remove the original +2 indent because there might be multiple nested block quotes // and it's easier to reason about the final indent by just taking the length - // of th new line_start. We update the indent because it effects the max width + // of the new line_start. We update the indent because it effects the max width // of each formatted line. line_start = itemized_block_quote_start(line, line_start, 2); indent = line_start.len(); } - ItemizedBlock { + Some(ItemizedBlock { lines: vec![line[indent..].to_string()], indent, opener: line[..indent].to_string(), line_start, - } + }) } /// Returns a `StringFormat` used for formatting the content of an item. @@ -495,7 +534,7 @@ impl ItemizedBlock { /// Returns `true` if the line is part of the current itemized block. /// If it is, then it is added to the internal lines list. fn add_line(&mut self, line: &str) -> bool { - if !ItemizedBlock::is_itemized_line(line) + if ItemizedBlock::get_marker_length(line.trim_start()).is_none() && self.indent <= line.chars().take_while(|c| c.is_whitespace()).count() { self.lines.push(line.to_string()); @@ -766,10 +805,11 @@ impl<'a> CommentRewrite<'a> { self.item_block = None; if let Some(stripped) = line.strip_prefix("```") { self.code_block_attr = Some(CodeBlockAttribute::new(stripped)) - } else if self.fmt.config.wrap_comments() && ItemizedBlock::is_itemized_line(line) { - let ib = ItemizedBlock::new(line); - self.item_block = Some(ib); - return false; + } else if self.fmt.config.wrap_comments() { + if let Some(ib) = ItemizedBlock::new(line) { + self.item_block = Some(ib); + return false; + } } if self.result == self.opener { @@ -2020,4 +2060,96 @@ fn main() { "#; assert_eq!(s, filter_normal_code(s_with_comment)); } + + #[test] + fn test_itemized_block_first_line_handling() { + fn run_test( + test_input: &str, + expected_line: &str, + expected_indent: usize, + expected_opener: &str, + expected_line_start: &str, + ) { + let block = ItemizedBlock::new(test_input).unwrap(); + assert_eq!(1, block.lines.len(), "test_input: {:?}", test_input); + assert_eq!( + expected_line, &block.lines[0], + "test_input: {:?}", + test_input + ); + assert_eq!( + expected_indent, block.indent, + "test_input: {:?}", + test_input + ); + assert_eq!( + expected_opener, &block.opener, + "test_input: {:?}", + test_input + ); + assert_eq!( + expected_line_start, &block.line_start, + "test_input: {:?}", + test_input + ); + } + + run_test("- foo", "foo", 2, "- ", " "); + run_test("* foo", "foo", 2, "* ", " "); + run_test("> foo", "foo", 2, "> ", "> "); + + run_test("1. foo", "foo", 3, "1. ", " "); + run_test("12. foo", "foo", 4, "12. ", " "); + run_test("1) foo", "foo", 3, "1) ", " "); + run_test("12) foo", "foo", 4, "12) ", " "); + + run_test(" - foo", "foo", 6, " - ", " "); + + // https://spec.commonmark.org/0.30 says: "A start number may begin with 0s": + run_test("0. foo", "foo", 3, "0. ", " "); + run_test("01. foo", "foo", 4, "01. ", " "); + } + + #[test] + fn test_itemized_block_nonobvious_markers_are_rejected() { + let test_inputs = vec![ + // Non-numeric item markers (e.g. `a.` or `iv.`) are not allowed by + // https://spec.commonmark.org/0.30/#ordered-list-marker. We also note that allowing + // them would risk misidentifying regular words as item markers. See also the + // discussion in https://talk.commonmark.org/t/blank-lines-before-lists-revisited/1990 + "word. rest of the paragraph.", + "a. maybe this is a list item? maybe not?", + "iv. maybe this is a list item? maybe not?", + // Numbers with 3 or more digits are not recognized as item markers, to avoid + // formatting the following example as a list: + // + // ``` + // The Captain died in + // 1868. He was buried in... + // ``` + "123. only 2-digit numbers are recognized as item markers.", + // Parens: + "123) giving some coverage to parens as well.", + "a) giving some coverage to parens as well.", + // https://spec.commonmark.org/0.30 says that "at least one space or tab is needed + // between the list marker and any following content": + "1.Not a list item.", + "1.2.3. Not a list item.", + "1)Not a list item.", + "-Not a list item.", + "+Not a list item.", + "+1 not a list item.", + // https://spec.commonmark.org/0.30 says: "A start number may not be negative": + "-1. Not a list item.", + "-1 Not a list item.", + ]; + for line in test_inputs.iter() { + let maybe_block = ItemizedBlock::new(line); + assert!( + maybe_block.is_none(), + "The following line shouldn't be classified as a list item: {}", + line + ); + } + } } diff --git a/tests/source/itemized-blocks/no_wrap.rs b/tests/source/itemized-blocks/no_wrap.rs index a7b6a10a01050..e5699e766848a 100644 --- a/tests/source/itemized-blocks/no_wrap.rs +++ b/tests/source/itemized-blocks/no_wrap.rs @@ -1,7 +1,7 @@ // rustfmt-normalize_comments: true // rustfmt-format_code_in_doc_comments: true -//! This is a list: +//! This is an itemized markdown list (see also issue #3224): //! * Outer //! * Outer //! * Inner @@ -13,6 +13,40 @@ //! - when the log level is info, the level name is green and the rest of the line is white //! - when the log level is debug, the whole line is white //! - when the log level is trace, the whole line is gray ("bright black") +//! +//! This is a numbered markdown list (see also issue #5416): +//! 1. Long long long long long long long long long long long long long long long long long line +//! 2. Another very long long long long long long long long long long long long long long long line +//! 3. Nested list +//! 1. Long long long long long long long long long long long long long long long long line +//! 2. Another very long long long long long long long long long long long long long long line +//! 4. Last item +//! +//! Using the ')' instead of '.' character after the number: +//! 1) Long long long long long long long long long long long long long long long long long line +//! 2) Another very long long long long long long long long long long long long long long long line +//! +//! Deep list that mixes various bullet and number formats: +//! 1. First level with a long long long long long long long long long long long long long long +//! long long long line +//! 2. First level with another very long long long long long long long long long long long long +//! long long long line +//! * Second level with a long long long long long long long long long long long long long +//! long long long line +//! * Second level with another very long long long long long long long long long long long +//! long long long line +//! 1) Third level with a long long long long long long long long long long long long long +//! long long long line +//! 2) Third level with another very long long long long long long long long long long +//! long long long long line +//! - Forth level with a long long long long long long long long long long long long +//! long long long long line +//! - Forth level with another very long long long long long long long long long long +//! long long long long line +//! 3) One more item at the third level +//! 4) Last item of the third level +//! * Last item of second level +//! 3. Last item of first level /// All the parameters ***except for `from_theater`*** should be inserted as sent by the remote /// theater, i.e., as passed to [`Theater::send`] on the remote actor: diff --git a/tests/source/itemized-blocks/wrap.rs b/tests/source/itemized-blocks/wrap.rs index 955cc698b79f2..768461a43f956 100644 --- a/tests/source/itemized-blocks/wrap.rs +++ b/tests/source/itemized-blocks/wrap.rs @@ -2,7 +2,7 @@ // rustfmt-format_code_in_doc_comments: true // rustfmt-max_width: 50 -//! This is a list: +//! This is an itemized markdown list (see also issue #3224): //! * Outer //! * Outer //! * Inner @@ -14,6 +14,40 @@ //! - when the log level is info, the level name is green and the rest of the line is white //! - when the log level is debug, the whole line is white //! - when the log level is trace, the whole line is gray ("bright black") +//! +//! This is a numbered markdown list (see also issue #5416): +//! 1. Long long long long long long long long long long long long long long long long long line +//! 2. Another very long long long long long long long long long long long long long long long line +//! 3. Nested list +//! 1. Long long long long long long long long long long long long long long long long line +//! 2. Another very long long long long long long long long long long long long long long line +//! 4. Last item +//! +//! Using the ')' instead of '.' character after the number: +//! 1) Long long long long long long long long long long long long long long long long long line +//! 2) Another very long long long long long long long long long long long long long long long line +//! +//! Deep list that mixes various bullet and number formats: +//! 1. First level with a long long long long long long long long long long long long long long +//! long long long line +//! 2. First level with another very long long long long long long long long long long long long +//! long long long line +//! * Second level with a long long long long long long long long long long long long long +//! long long long line +//! * Second level with another very long long long long long long long long long long long +//! long long long line +//! 1) Third level with a long long long long long long long long long long long long long +//! long long long line +//! 2) Third level with another very long long long long long long long long long long +//! long long long long line +//! - Forth level with a long long long long long long long long long long long long +//! long long long long line +//! - Forth level with another very long long long long long long long long long long +//! long long long long line +//! 3) One more item at the third level +//! 4) Last item of the third level +//! * Last item of second level +//! 3. Last item of first level // This example shows how to configure fern to output really nicely colored logs // - when the log level is error, the whole line is red diff --git a/tests/target/itemized-blocks/no_wrap.rs b/tests/target/itemized-blocks/no_wrap.rs index de88563827263..86818b4474591 100644 --- a/tests/target/itemized-blocks/no_wrap.rs +++ b/tests/target/itemized-blocks/no_wrap.rs @@ -1,7 +1,7 @@ // rustfmt-normalize_comments: true // rustfmt-format_code_in_doc_comments: true -//! This is a list: +//! This is an itemized markdown list (see also issue #3224): //! * Outer //! * Outer //! * Inner @@ -13,6 +13,40 @@ //! - when the log level is info, the level name is green and the rest of the line is white //! - when the log level is debug, the whole line is white //! - when the log level is trace, the whole line is gray ("bright black") +//! +//! This is a numbered markdown list (see also issue #5416): +//! 1. Long long long long long long long long long long long long long long long long long line +//! 2. Another very long long long long long long long long long long long long long long long line +//! 3. Nested list +//! 1. Long long long long long long long long long long long long long long long long line +//! 2. Another very long long long long long long long long long long long long long long line +//! 4. Last item +//! +//! Using the ')' instead of '.' character after the number: +//! 1) Long long long long long long long long long long long long long long long long long line +//! 2) Another very long long long long long long long long long long long long long long long line +//! +//! Deep list that mixes various bullet and number formats: +//! 1. First level with a long long long long long long long long long long long long long long +//! long long long line +//! 2. First level with another very long long long long long long long long long long long long +//! long long long line +//! * Second level with a long long long long long long long long long long long long long +//! long long long line +//! * Second level with another very long long long long long long long long long long long +//! long long long line +//! 1) Third level with a long long long long long long long long long long long long long +//! long long long line +//! 2) Third level with another very long long long long long long long long long long +//! long long long long line +//! - Forth level with a long long long long long long long long long long long long +//! long long long long line +//! - Forth level with another very long long long long long long long long long long +//! long long long long line +//! 3) One more item at the third level +//! 4) Last item of the third level +//! * Last item of second level +//! 3. Last item of first level /// All the parameters ***except for `from_theater`*** should be inserted as sent by the remote /// theater, i.e., as passed to [`Theater::send`] on the remote actor: diff --git a/tests/target/itemized-blocks/wrap.rs b/tests/target/itemized-blocks/wrap.rs index a4907303c9e1b..4826590ea59dd 100644 --- a/tests/target/itemized-blocks/wrap.rs +++ b/tests/target/itemized-blocks/wrap.rs @@ -2,7 +2,8 @@ // rustfmt-format_code_in_doc_comments: true // rustfmt-max_width: 50 -//! This is a list: +//! This is an itemized markdown list (see also +//! issue #3224): //! * Outer //! * Outer //! * Inner @@ -23,6 +24,65 @@ //! is white //! - when the log level is trace, the whole line //! is gray ("bright black") +//! +//! This is a numbered markdown list (see also +//! issue #5416): +//! 1. Long long long long long long long long +//! long long long long long long long long +//! long line +//! 2. Another very long long long long long long +//! long long long long long long long long +//! long line +//! 3. Nested list +//! 1. Long long long long long long long long +//! long long long long long long long long +//! line +//! 2. Another very long long long long long +//! long long long long long long long long +//! long line +//! 4. Last item +//! +//! Using the ')' instead of '.' character after +//! the number: +//! 1) Long long long long long long long long +//! long long long long long long long long +//! long line +//! 2) Another very long long long long long long +//! long long long long long long long long +//! long line +//! +//! Deep list that mixes various bullet and number +//! formats: +//! 1. First level with a long long long long long +//! long long long long long long long long +//! long long long long line +//! 2. First level with another very long long +//! long long long long long long long long +//! long long long long long line +//! * Second level with a long long long long +//! long long long long long long long long +//! long long long long line +//! * Second level with another very long long +//! long long long long long long long long +//! long long long long line +//! 1) Third level with a long long long +//! long long long long long long long +//! long long long long long long line +//! 2) Third level with another very long +//! long long long long long long long +//! long long long long long long line +//! - Forth level with a long long +//! long long long long long long +//! long long long long long long +//! long long line +//! - Forth level with another very +//! long long long long long long +//! long long long long long long +//! long long line +//! 3) One more item at the third level +//! 4) Last item of the third level +//! * Last item of second level +//! 3. Last item of first level // This example shows how to configure fern to // output really nicely colored logs