Skip to content

Commit

Permalink
Merge pull request #908 from cellomath/main
Browse files Browse the repository at this point in the history
Added rust-style raw string syntax support for the Rhai Tokenizer
  • Loading branch information
schungx authored Aug 29, 2024
2 parents 5028afb + 72c84fd commit a0c2ebc
Show file tree
Hide file tree
Showing 2 changed files with 156 additions and 2 deletions.
133 changes: 132 additions & 1 deletion src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use std::prelude::v1::*;
use std::{
cell::RefCell,
char, fmt,
iter::{FusedIterator, Peekable},
iter::{repeat, FusedIterator, Peekable},
rc::Rc,
str::{Chars, FromStr},
};
Expand Down Expand Up @@ -1177,6 +1177,129 @@ pub trait InputStream {
}
}

/// _(internals)_ Parse a raw string literal. Raw string literals do not process any escapes.
/// Raw string literals do not process any escapes. They start with the character
/// `U+0072` (`r`), followed by fewer than 256 of the character `U+0023` (`#`) and a
/// `U+0022` (double-quote) character.
///
/// The _raw string body_ can contain any sequence of Unicode characters other than `U+000D` (CR).
/// It is terminated only by another `U+0022` (double-quote) character, followed by the same number of `U+0023` (`#`) characters that preceded the opening `U+0022` (double-quote) character.
///
/// All Unicode characters contained in the raw string body represent themselves,
/// the characters `U+0022` (double-quote) (except when followed by at least as
/// many `U+0023` (`#`) characters as were used to start the raw string literal) or
/// `U+005C` (`\`) do not have any special meaning.
///
/// Returns the parsed string.
///
/// # Returns
///
/// | Type | Return Value |
/// |---------------------------|:-----------------------------------:|
/// |`r"hello"` |`StringConstant("hello")` |
/// |`r"hello`_{EOF}_ |`LexError` |
/// |`r#" "hello" "`_{EOF}_ |`LexError` |
/// |`r#""hello""#` |`StringConstant("\"hello\"")` |
/// |`r##"hello #"# world"##` |`StringConstant("hello #\"# world")` |
/// |`r"R"` |`StringConstant("R")` |
/// |`r"\x52"` |`StringConstant("\\x52")` |
///
/// This function throws a `LexError` for an unterminated literal string at _{EOF}_.
pub fn parse_raw_string_literal(
stream: &mut (impl InputStream + ?Sized),
state: &mut TokenizeState,

Check warning on line 1210 in src/tokenizer.rs

View workflow job for this annotation

GitHub Actions / Build (ubuntu-latest, --features testing-environ,no_time,no_function,no_float,no_position,no_inde...

unused variable: `state`

Check warning on line 1210 in src/tokenizer.rs

View workflow job for this annotation

GitHub Actions / Build (ubuntu-latest, --features testing-environ,sync,no_time,no_function,no_float,no_position,no...

unused variable: `state`

Check warning on line 1210 in src/tokenizer.rs

View workflow job for this annotation

GitHub Actions / Build (ubuntu-latest, --features testing-environ,unchecked,serde,metadata,internals,debugging, st...

unused variable: `state`
pos: &mut Position,
) -> Result<(SmartString, Position), (LexError, Position)> {
let start = *pos;
let mut first_char = Position::NONE;

// Count the number of '#'s
let mut hash_count = 0;
while let Some('#') = stream.peek_next() {
stream.eat_next_and_advance(pos);
hash_count += 1;
}

// Match '"'
match stream.get_next() {
Some('"') => pos.advance(),
Some(c) => return Err((LERR::UnexpectedInput(c.to_string()), start)),
None => return Err((LERR::UnterminatedString, start))
}

// Match everything until the same number of '#'s are seen, prepended by a '"'

// Counts the number of '#' characters seen after a quotation mark.
// Becomes Some(0) after a quote is seen, but resets to None if a hash doesn't follow.
let mut seen_hashes: Option<u8> = None;
let mut result = SmartString::new_const();


loop {
let next_char = match stream.get_next() {
Some(ch) => ch,
None => return Err((LERR::UnterminatedString, start))
};

match (next_char, &mut seen_hashes) {
// Begin attempt to close string
('"', None) => {
if hash_count == 0 {
return Ok((result, first_char));
} else {
seen_hashes = Some(0);
}
}
// Restart attempt to close string
('"', Some(count)) => {
if hash_count == 0 {
return Ok((result, first_char));
} else {
// result.reserve(*count as usize+c.len());
result.push('"');
result.extend(repeat('#').take(*count as usize));
seen_hashes = Some(0);
}
}
// Continue attempt to close string
('#', Some(count)) => {
*count += 1;
if *count == hash_count {
return Ok((result, first_char));
}
}
// Fail to close the string - add previous quote and hashes
(c, Some(count)) => {
// result.reserve(*count as usize +1+c.len());
result.push('"');
result.extend(repeat('#').take(*count as usize));
result.push(c);
seen_hashes = None;
}
// Normal new character seen
(c, None) => result.push(c)
}

if next_char == '\n' {
pos.new_line();
} else {
pos.advance();
}

// Check string length
#[cfg(not(feature = "unchecked"))]
if let Some(max) = state.max_string_len {
if result.len() > max.get() {
return Err((LexError::StringTooLong(max.get()), start));
}
}

if first_char.is_none() {
first_char = *pos;
}
}
}

/// _(internals)_ Parse a string literal ended by a specified termination character.
/// Exported under the `internals` feature only.
///
Expand Down Expand Up @@ -1795,6 +1918,14 @@ fn get_next_token_inner(
);
}

// r - raw string literal
('r', '"' | '#') => {
return parse_raw_string_literal(stream, state, pos).map_or_else(
|(err, err_pos)| (Token::LexError(err.into()), err_pos),
|(result, ..)| (Token::StringConstant(result.into()), start_pos),
);
}

// ' - character literal
('\'', '\'') => {
return (
Expand Down
25 changes: 24 additions & 1 deletion tests/string.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use rhai::{Engine, EvalAltResult, ImmutableString, Scope, INT};
use rhai::{Engine, EvalAltResult, ImmutableString, LexError, ParseErrorType, Position, Scope, INT};

#[test]
fn test_string() {
Expand All @@ -17,6 +17,29 @@ fn test_string() {
assert_eq!(engine.eval::<String>(" `\r\nTest string: \\u2764\nhello,\\nworld!`").unwrap(), "Test string: \\u2764\nhello,\\nworld!");
assert_eq!(engine.eval::<String>(r#""Test string: \x58""#).unwrap(), "Test string: X");
assert_eq!(engine.eval::<String>(r#""\"hello\"""#).unwrap(), r#""hello""#);
assert_eq!(engine.eval::<String>(r#"r"Test""#).unwrap(), "Test");
assert_eq!(engine.eval::<String>(r#"r"Test string: \\u2764\nhello,\nworld!""#).unwrap(), r#"Test string: \\u2764\nhello,\nworld!"#);
assert_eq!(engine.eval::<String>(r###"r##"Test string: r#"\\u2764\nhello,\\nworld!"#"##"###).unwrap(), r##"Test string: r#"\\u2764\nhello,\\nworld!"#"##);
assert_eq!(engine.eval::<String>(r###"r##"Test string: "## + "\u2764""###).unwrap(), "Test string: ❤");
let bad_result = *engine.eval::<String>(r###"r#"Test string: \"##"###).unwrap_err();
if let EvalAltResult::ErrorParsing(parse_error, pos) = bad_result {
assert_eq!(parse_error, ParseErrorType::UnknownOperator("#".to_string()));
assert_eq!(pos, Position::new(1, 19));
} else {
panic!("Wrong error type: {}", bad_result);
}
let bad_result = *engine
.eval::<String>(
r###"r##"Test string:
\"#"###,
)
.unwrap_err();
if let EvalAltResult::ErrorParsing(parse_error, pos) = bad_result {
assert_eq!(parse_error, ParseErrorType::BadInput(LexError::UnterminatedString));
assert_eq!(pos, Position::new(1, 1));
} else {
panic!("Wrong error type: {}", bad_result);
}

assert_eq!(engine.eval::<String>(r#""foo" + "bar""#).unwrap(), "foobar");

Expand Down

0 comments on commit a0c2ebc

Please sign in to comment.