-
Notifications
You must be signed in to change notification settings - Fork 37
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adds LazyRawTextReader
support for reading symbols
#616
Changes from all commits
e0a83d8
89f79aa
840be4d
5db1ff0
07d4a70
181e0a5
357ca8f
716ff34
e29fec5
8f79a36
4cb9b2b
54470d2
eba5913
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,8 +5,8 @@ use std::slice::Iter; | |
|
||
use nom::branch::alt; | ||
use nom::bytes::streaming::{is_a, is_not, tag, take_until, take_while1}; | ||
use nom::character::streaming::{char, digit1, one_of}; | ||
use nom::combinator::{fail, map, opt, peek, recognize, success, value}; | ||
use nom::character::streaming::{char, digit1, one_of, satisfy}; | ||
use nom::combinator::{fail, map, not, opt, peek, recognize, success, value}; | ||
use nom::error::{ErrorKind, ParseError}; | ||
use nom::multi::many0_count; | ||
use nom::sequence::{delimited, pair, preceded, separated_pair, terminated, tuple}; | ||
|
@@ -16,9 +16,9 @@ use crate::lazy::encoding::TextEncoding; | |
use crate::lazy::raw_stream_item::RawStreamItem; | ||
use crate::lazy::text::encoded_value::EncodedTextValue; | ||
use crate::lazy::text::matched::{ | ||
MatchedFloat, MatchedInt, MatchedShortString, MatchedString, MatchedValue, | ||
MatchedFloat, MatchedInt, MatchedShortString, MatchedString, MatchedSymbol, MatchedValue, | ||
}; | ||
use crate::lazy::text::parse_result::IonParseError; | ||
use crate::lazy::text::parse_result::{InvalidInputError, IonParseError}; | ||
use crate::lazy::text::parse_result::{IonMatchResult, IonParseResult}; | ||
use crate::lazy::text::value::LazyRawTextValue; | ||
use crate::result::DecodingError; | ||
|
@@ -275,6 +275,16 @@ impl<'data> TextBufferView<'data> { | |
) | ||
}, | ||
), | ||
map( | ||
match_and_length(Self::match_symbol), | ||
|(matched_symbol, length)| { | ||
EncodedTextValue::new( | ||
MatchedValue::Symbol(matched_symbol), | ||
self.offset(), | ||
length, | ||
) | ||
}, | ||
), | ||
// TODO: The other Ion types | ||
)) | ||
.map(|encoded_value| LazyRawTextValue { | ||
|
@@ -463,6 +473,7 @@ impl<'data> TextBufferView<'data> { | |
Self::match_float_numeric_value, | ||
))(self) | ||
} | ||
|
||
/// Matches special IEEE-754 values, including +/- infinity and NaN. | ||
fn match_float_special_value(self) -> IonParseResult<'data, MatchedFloat> { | ||
alt(( | ||
|
@@ -577,6 +588,109 @@ impl<'data> TextBufferView<'data> { | |
/// Returns a matched buffer and a boolean indicating whether any escaped characters were | ||
/// found in the short string. | ||
fn match_short_string_body(self) -> IonParseResult<'data, (Self, bool)> { | ||
Self::match_text_until_unescaped(self, b'\"') | ||
} | ||
|
||
fn match_long_string(self) -> IonParseResult<'data, MatchedString> { | ||
// TODO: implement long string matching | ||
// The `fail` parser is a nom builtin that never matches. | ||
fail(self) | ||
} | ||
|
||
/// Matches a symbol ID (`$28`), an identifier (`foo`), or a quoted symbol (`'foo'`). | ||
fn match_symbol(self) -> IonParseResult<'data, MatchedSymbol> { | ||
// TODO: operators | ||
alt(( | ||
Self::match_symbol_id, | ||
Self::match_identifier, | ||
Self::match_quoted_symbol, | ||
))(self) | ||
} | ||
|
||
/// Matches a symbol ID (`$28`). | ||
fn match_symbol_id(self) -> IonParseResult<'data, MatchedSymbol> { | ||
recognize(terminated( | ||
// Discard a `$` and parse an integer representing the symbol ID. | ||
// Note that symbol ID integers: | ||
// * CANNOT have underscores in them. For example: `$1_0` is considered an identifier. | ||
// * CAN have leading zeros. There's precedent for this in ion-java. | ||
preceded(tag("$"), digit1), | ||
// Peek at the next character to make sure it's unrelated to the symbol ID. | ||
// The spec does not offer a formal definition of what ends a symbol ID. | ||
// This checks for either a stop_character (which performs its own `peek()`) | ||
// or a colon (":"), which could be a field delimiter (":") or the beginning of | ||
// an annotation delimiter ('::'). | ||
alt(( | ||
// Each of the parsers passed to `alt` must have the same return type. `stop_character` | ||
// returns a char instead of a &str, so we use `recognize()` to get a &str instead. | ||
recognize(Self::peek_stop_character), | ||
peek(tag(":")), // Field delimiter (":") or annotation delimiter ("::") | ||
)), | ||
)) | ||
.map(|_matched| MatchedSymbol::SymbolId) | ||
.parse(self) | ||
} | ||
|
||
/// Matches an identifier (`foo`). | ||
fn match_identifier(self) -> IonParseResult<'data, MatchedSymbol> { | ||
let (remaining, identifier_text) = recognize(terminated( | ||
pair( | ||
Self::identifier_initial_character, | ||
Self::identifier_trailing_characters, | ||
), | ||
not(Self::identifier_trailing_character), | ||
))(self)?; | ||
// Ion defines a number of keywords that are syntactically indistinguishable from | ||
// identifiers. Keywords take precedence; we must ensure that any identifier we find | ||
// is not actually a keyword. | ||
const KEYWORDS: &[&str] = &["true", "false", "nan", "null"]; | ||
// In many situations, this check will not be necessary. Another type's parser will | ||
// recognize the keyword as its own. (For example, `parse_boolean` would match the input | ||
// text `false`.) However, because symbols can appear in annotations and the check for | ||
// annotations precedes the parsing for all other types, we need this extra verification. | ||
if KEYWORDS | ||
.iter() | ||
.any(|k| k.as_bytes() == identifier_text.bytes()) | ||
{ | ||
// Finding a keyword is not a fatal error, it just means that this parser doesn't match. | ||
return Err(nom::Err::Error(IonParseError::Invalid( | ||
InvalidInputError::new(self), | ||
))); | ||
} | ||
Ok((remaining, MatchedSymbol::Identifier)) | ||
} | ||
|
||
/// Matches any character that can appear at the start of an identifier. | ||
fn identifier_initial_character(self) -> IonParseResult<'data, Self> { | ||
recognize(alt((one_of("$_"), satisfy(|c| c.is_ascii_alphabetic()))))(self) | ||
} | ||
|
||
/// Matches any character that is legal in an identifier, though not necessarily at the beginning. | ||
fn identifier_trailing_character(self) -> IonParseResult<'data, Self> { | ||
recognize(alt((one_of("$_"), satisfy(|c| c.is_ascii_alphanumeric()))))(self) | ||
} | ||
|
||
/// Matches characters that are legal in an identifier, though not necessarily at the beginning. | ||
fn identifier_trailing_characters(self) -> IonParseResult<'data, Self> { | ||
recognize(many0_count(Self::identifier_trailing_character))(self) | ||
} | ||
|
||
/// Matches a quoted symbol (`'foo'`). | ||
fn match_quoted_symbol(self) -> IonParseResult<'data, MatchedSymbol> { | ||
delimited(char('\''), Self::match_quoted_symbol_body, char('\'')) | ||
.map(|(_matched, contains_escaped_chars)| MatchedSymbol::Quoted(contains_escaped_chars)) | ||
.parse(self) | ||
} | ||
|
||
/// Returns a matched buffer and a boolean indicating whether any escaped characters were | ||
/// found in the short string. | ||
fn match_quoted_symbol_body(self) -> IonParseResult<'data, (Self, bool)> { | ||
Self::match_text_until_unescaped(self, b'\'') | ||
} | ||
|
||
/// A helper method for matching bytes until the specified delimiter. Ignores any byte | ||
/// (including the delimiter) that is prefaced by the escape character `\`. | ||
fn match_text_until_unescaped(self, delimiter: u8) -> IonParseResult<'data, (Self, bool)> { | ||
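There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🗺️ This method was previously `match_short_string_body` with the `"` delimiter hard-coded; it now takes the delimiter as a parameter. |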
||
let mut is_escaped = false; | ||
let mut contains_escaped_chars = false; | ||
for (index, byte) in self.bytes().iter().enumerate() { | ||
|
@@ -590,20 +704,14 @@ impl<'data> TextBufferView<'data> { | |
contains_escaped_chars = true; | ||
continue; | ||
} | ||
if *byte == b'\"' { | ||
if *byte == delimiter { | ||
let matched = self.slice(0, index); | ||
let remaining = self.slice_to_end(index); | ||
return Ok((remaining, (matched, contains_escaped_chars))); | ||
} | ||
} | ||
Err(nom::Err::Incomplete(Needed::Unknown)) | ||
} | ||
|
||
fn match_long_string(self) -> IonParseResult<'data, MatchedString> { | ||
// TODO: implement long string matching | ||
// The `fail` parser is a nom builtin that never matches. | ||
fail(self) | ||
} | ||
} | ||
|
||
// === nom trait implementations === | ||
|
@@ -839,13 +947,17 @@ mod tests { | |
P: Parser<TextBufferView<'data>, O, IonParseError<'data>>, | ||
{ | ||
let result = self.try_match(parser); | ||
// We expect this to fail for one reason or another | ||
assert!( | ||
result.is_err(), | ||
"Expected a parse failure for input: {:?}\nResult: {:?}", | ||
self.input, | ||
result | ||
); | ||
// We expect that only part of the input will match or that the entire | ||
// input will be rejected outright. | ||
if let Ok((_remaining, match_length)) = result { | ||
assert_ne!( | ||
match_length, | ||
self.input.len() - 1, | ||
"parser unexpectedly matched the complete input: '{:?}\nResult: {:?}", | ||
self.input, | ||
result | ||
); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🗺️ Prior to this change, this unit test method would assert that there was no match. However, it was possible for the parser to match part of the input and report success. Now this method requires that the parser match the entire test input to be considered a successful match. |
||
} | ||
} | ||
|
||
|
@@ -1038,13 +1150,54 @@ mod tests { | |
r#" | ||
hello" | ||
"#, | ||
// Missing a trailing quote | ||
// Missing a closing quote | ||
r#" | ||
"hello | ||
"#, | ||
// Closing quote is escaped | ||
r#" | ||
"hello\" | ||
"#, | ||
]; | ||
for input in bad_inputs { | ||
mismatch_string(input); | ||
} | ||
} | ||
|
||
/// Exercises `TextBufferView::match_symbol` against inputs that should and should not match.
#[test]
fn test_match_symbol() {
    // Asserts that the symbol matcher accepts `input`.
    fn match_symbol(input: &str) {
        MatchTest::new(input).expect_match(match_length(TextBufferView::match_symbol));
    }
    // Asserts that the symbol matcher does not accept `input`.
    fn mismatch_symbol(input: &str) {
        MatchTest::new(input).expect_mismatch(match_length(TextBufferView::match_symbol));
    }

    // Each input below is written without surrounding whitespace and is handed to the
    // helpers above exactly as written; no trimming is performed in this test.
    let good_inputs = &[
        // Quoted symbols
        "'hello'",
        "'😀😀😀'",
        "'this has an escaped quote \\' right in the middle'",
        // Symbol IDs
        "$308",
        "$0",
        // Identifiers
        "foo",
        "name",
        "$bar",
        "_baz_quux",
    ];
    for input in good_inputs {
        match_symbol(input);
    }

    let bad_inputs = &[
        "'hello",    // No closing quote
        "'hello\\'", // Closing quote is escaped
        "$-8",       // Negative SID
        "nan",       // Identifier that is also a keyword
    ];
    for input in bad_inputs {
        mismatch_symbol(input);
    }
}
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🗺️ This placeholder method was moved from further down in the file.