@@ -432,14 +432,18 @@ impl CodeBlockAttribute {
432
432
433
433
/// Block that is formatted as an item.
434
434
///
435
- /// An item starts with either a star `*`, a dash `-`, a greater-than `>`,
436
- /// or a number `12.` or `34)` (with at most 2 digits).
435
+ /// An item starts with either a star `*`, a dash `-`, a greater-than `>`, or a number `12.` or
436
+ /// `34)` (with at most 2 digits). An item represents CommonMark's ["list
437
+ /// items"](https://spec.commonmark.org/0.30/#list-items) and/or ["block
438
+ /// quotes"](https://spec.commonmark.org/0.30/#block-quotes), but note that only a subset of
439
+ /// CommonMark is recognized - see the doc comment of `ItemizedBlock::get_marker_length` for more
440
+ /// details.
437
441
///
438
442
/// Different level of indentation are handled by shrinking the shape accordingly.
439
443
struct ItemizedBlock {
440
444
/// the lines that are identified as part of an itemized block
441
445
lines : Vec < String > ,
442
- /// the number of characters (typically whitespaces) up to the item sigil
446
+ /// the number of characters (typically whitespaces) up to the item marker
443
447
indent : usize ,
444
448
/// the string that marks the start of an item
445
449
opener : String ,
@@ -448,12 +452,33 @@ struct ItemizedBlock {
448
452
}
449
453
450
454
impl ItemizedBlock {
451
- /// Returns the sigil's (e.g. "- ", "* ", or "1. ") length or None if there is no sigil.
452
- fn get_sigil_length ( trimmed : & str ) -> Option < usize > {
455
+ /// Checks whether the `trimmed` line includes an item marker. Returns `None` if there is no
456
+ /// marker. Returns the length of the marker if one is present. Note that the length includes
457
+ /// the whitespace that follows the marker, for example the marker in `"* list item"` has the
458
+ /// length of 2.
459
+ ///
460
+ /// This function recognizes item markers that correspond to CommonMark's
461
+ /// ["bullet list marker"](https://spec.commonmark.org/0.30/#bullet-list-marker),
462
+ /// ["block quote marker"](https://spec.commonmark.org/0.30/#block-quote-marker), and/or
463
+ /// ["ordered list marker"](https://spec.commonmark.org/0.30/#ordered-list-marker).
464
+ ///
465
+ /// Compared to CommonMark specification, the number of digits that are allowed in an ["ordered
466
+ /// list marker"](https://spec.commonmark.org/0.30/#ordered-list-marker) is more limited (to at
467
+ /// most 2 digits). Limiting the length of the marker helps reduce the risk of recognizing
468
+ /// arbitrary numbers as markers. See also
469
+ /// https://talk.commonmark.org/t/blank-lines-before-lists-revisited/1990 which gives the
470
+ /// following example where a number (i.e. "1868") doesn't signify an ordered list:
471
+ /// > The Captain died in
472
+ /// > 1868. He wes buried in...
473
+ fn get_marker_length ( trimmed : & str ) -> Option < usize > {
474
+ // https://spec.commonmark.org/0.30/#bullet-list-marker or
475
+ // https://spec.commonmark.org/0.30/#block-quote-marker
453
476
if trimmed. starts_with ( "* " ) || trimmed. starts_with ( "- " ) || trimmed. starts_with ( "> " ) {
454
477
return Some ( 2 ) ;
455
478
}
456
479
480
+ // https://spec.commonmark.org/0.30/#ordered-list-marker, where at most 2 digits are
481
+ // allowed.
457
482
for suffix in [ ". " , ") " ] {
458
483
if let Some ( ( prefix, _) ) = trimmed. split_once ( suffix) {
459
484
if prefix. len ( ) <= 2 && prefix. chars ( ) . all ( |c| char:: is_ascii_digit ( & c) ) {
@@ -462,32 +487,31 @@ impl ItemizedBlock {
462
487
}
463
488
}
464
489
465
- None
490
+ None // No markers found.
466
491
}
467
492
468
493
/// Creates a new ItemizedBlock described with the given `line`
469
494
/// or None if `line` doesn't start an item.
470
495
fn new ( line : & str ) -> Option < ItemizedBlock > {
471
- ItemizedBlock :: get_sigil_length ( line. trim_start ( ) ) . map ( |sigil_length| {
472
- let space_to_sigil = line. chars ( ) . take_while ( |c| c. is_whitespace ( ) ) . count ( ) ;
473
- let mut indent = space_to_sigil + sigil_length;
474
- let mut line_start = " " . repeat ( indent) ;
475
-
476
- // Markdown blockquote start with a "> "
477
- if line. trim_start ( ) . starts_with ( ">" ) {
478
- // remove the original +2 indent because there might be multiple nested block quotes
479
- // and it's easier to reason about the final indent by just taking the length
480
- // of the new line_start. We update the indent because it effects the max width
481
- // of each formatted line.
482
- line_start = itemized_block_quote_start ( line, line_start, 2 ) ;
483
- indent = line_start. len ( ) ;
484
- }
485
- ItemizedBlock {
486
- lines : vec ! [ line[ indent..] . to_string( ) ] ,
487
- indent,
488
- opener : line[ ..indent] . to_string ( ) ,
489
- line_start,
490
- }
496
+ let marker_length = ItemizedBlock :: get_marker_length ( line. trim_start ( ) ) ?;
497
+ let space_to_marker = line. chars ( ) . take_while ( |c| c. is_whitespace ( ) ) . count ( ) ;
498
+ let mut indent = space_to_marker + marker_length;
499
+ let mut line_start = " " . repeat ( indent) ;
500
+
501
+ // Markdown blockquote start with a "> "
502
+ if line. trim_start ( ) . starts_with ( ">" ) {
503
+ // remove the original +2 indent because there might be multiple nested block quotes
504
+ // and it's easier to reason about the final indent by just taking the length
505
+ // of the new line_start. We update the indent because it effects the max width
506
+ // of each formatted line.
507
+ line_start = itemized_block_quote_start ( line, line_start, 2 ) ;
508
+ indent = line_start. len ( ) ;
509
+ }
510
+ Some ( ItemizedBlock {
511
+ lines : vec ! [ line[ indent..] . to_string( ) ] ,
512
+ indent,
513
+ opener : line[ ..indent] . to_string ( ) ,
514
+ line_start,
491
515
} )
492
516
}
493
517
@@ -507,7 +531,7 @@ impl ItemizedBlock {
507
531
/// Returns `true` if the line is part of the current itemized block.
508
532
/// If it is, then it is added to the internal lines list.
509
533
fn add_line ( & mut self , line : & str ) -> bool {
510
- if ItemizedBlock :: get_sigil_length ( line. trim_start ( ) ) . is_none ( )
534
+ if ItemizedBlock :: get_marker_length ( line. trim_start ( ) ) . is_none ( )
511
535
&& self . indent <= line. chars ( ) . take_while ( |c| c. is_whitespace ( ) ) . count ( )
512
536
{
513
537
self . lines . push ( line. to_string ( ) ) ;
@@ -2058,27 +2082,30 @@ fn main() {
2058
2082
2059
2083
run_test ( "1. foo" , "foo" , 3 , "1. " , " " ) ;
2060
2084
run_test ( "12. foo" , "foo" , 4 , "12. " , " " ) ;
2085
+ run_test ( "1) foo" , "foo" , 3 , "1) " , " " ) ;
2086
+ run_test ( "12) foo" , "foo" , 4 , "12) " , " " ) ;
2061
2087
2062
2088
run_test ( " - foo" , "foo" , 6 , " - " , " " ) ;
2063
2089
}
2064
2090
2065
2091
#[ test]
2066
- fn test_itemized_block_nonobvious_sigils_are_rejected ( ) {
2092
+ fn test_itemized_block_nonobvious_markers_are_rejected ( ) {
2067
2093
let test_inputs = vec ! [
2068
- // Non-numeric sigils (e.g. `a.` or `iv.`) are not supported, because of a risk of
2069
- // misidentifying regular words as sigils. See also the discussion in
2070
- // https://talk.commonmark.org/t/blank-lines-before-lists-revisited/1990
2094
+ // Non-numeric item markers (e.g. `a.` or `iv.`) are not allowed by
2095
+ // https://spec.commonmark.org/0.30/#ordered-list-marker. We also note that allowing
2096
+ // them would risk misidentifying regular words as item markers. See also the
2097
+ // discussion in https://talk.commonmark.org/t/blank-lines-before-lists-revisited/1990
2071
2098
"word. rest of the paragraph." ,
2072
2099
"a. maybe this is a list item? maybe not?" ,
2073
2100
"iv. maybe this is a list item? maybe not?" ,
2074
- // Numbers with 3 or more digits are not recognized as sigils , to avoid
2101
+ // Numbers with 3 or more digits are not recognized as item markers , to avoid
2075
2102
// formatting the following example as a list:
2076
2103
//
2077
2104
// ```
2078
2105
// The Captain died in
2079
- // 1868. He was buried in...
2106
+ // 1868. He was buried in...
2080
2107
// ```
2081
- "123. only 2-digit numbers are recognized as sigils ." ,
2108
+ "123. only 2-digit numbers are recognized as item markers ." ,
2082
2109
// Parens.
2083
2110
"123) giving some coverage to parens as well." ,
2084
2111
"a) giving some coverage to parens as well." ,
0 commit comments