Skip to content

Commit

Permalink
revset: parse unicode XID_CONTINUE characters as symbol
Browse files Browse the repository at this point in the history
Tag and bookmark names are usually ASCII, but they occasionally include
non-ASCII Latin (e.g. accented) or Han characters.

This doesn't fix the serialization problem, but should mitigate jj-vcs#5359.
  • Loading branch information
yuja committed Jan 15, 2025
1 parent 7f76f50 commit 4bd4719
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 1 deletion.
5 changes: 4 additions & 1 deletion lib/src/revset.pest
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@

// Matches a single whitespace character: space, tab, CR, LF, or form feed
// (\x0c). The rule is silent (`_`), so it produces no token pair of its own.
whitespace = _{ " " | "\t" | "\r" | "\n" | "\x0c" }

identifier_part = @{ (ASCII_ALPHANUMERIC | "_" | "/")+ }
// XID_CONTINUE: https://www.unicode.org/reports/tr31/#Default_Identifier_Syntax
// +, -, .: often included in tag/bookmark names or version numbers (allowed
// only between identifier parts; see `identifier`)
// /: sometimes used as a tag/bookmark namespace separator
identifier_part = @{ (XID_CONTINUE | "_" | "/")+ }
// A symbol is one or more `identifier_part`s joined by single ".", "-", or "+"
// separators. A separator must be followed by another part, so it cannot end
// the identifier or appear twice in a row, and the identifier cannot start
// with one. The rule is atomic (`@`), so no implicit whitespace is accepted
// between the pieces.
identifier = @{
identifier_part ~ (("." | "-" | "+") ~ identifier_part)*
}
Expand Down
30 changes: 30 additions & 0 deletions lib/src/revset_parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1144,6 +1144,14 @@ mod tests {

#[test]
fn test_parse_identifier() {
// Integer is a symbol
assert_eq!(parse_into_kind("0"), Ok(ExpressionKind::Identifier("0")));
// Tag/bookmark name separated by /
assert_eq!(
parse_into_kind("foo_bar/baz"),
Ok(ExpressionKind::Identifier("foo_bar/baz"))
);

// Internal '.', '-', and '+' are allowed
assert_eq!(
parse_into_kind("foo.bar-v1+7"),
Expand Down Expand Up @@ -1178,6 +1186,12 @@ mod tests {

// Parse a parenthesized symbol
assert_eq!(parse_normalized("(foo)"), parse_normalized("foo"));

// Non-ASCII tag/bookmark name
assert_eq!(
parse_into_kind("柔術+jj"),
Ok(ExpressionKind::Identifier("柔術+jj"))
);
}

#[test]
Expand Down Expand Up @@ -1321,6 +1335,19 @@ mod tests {
parse_into_kind(r#""main@origin""#),
Ok(ExpressionKind::String("main@origin".to_owned()))
);

// Non-ASCII name
assert_eq!(
parse_into_kind("柔術@"),
Ok(ExpressionKind::AtWorkspace("柔術".to_owned()))
);
assert_eq!(
parse_into_kind("柔@術"),
Ok(ExpressionKind::RemoteSymbol {
name: "柔".to_owned(),
remote: "術".to_owned()
})
);
}

#[test]
Expand All @@ -1330,6 +1357,9 @@ mod tests {
assert!(aliases_map.insert("@", "none()").is_err());
assert!(aliases_map.insert("a@", "none()").is_err());
assert!(aliases_map.insert("a@b", "none()").is_err());
// Non-ASCII characters aren't allowed in alias symbols. This rule can be
// relaxed if needed.
assert!(aliases_map.insert("柔術", "none()").is_err());
}

#[test]
Expand Down

0 comments on commit 4bd4719

Please sign in to comment.