Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds LazyRawTextReader support for reading symbols #616

Merged
merged 13 commits into from
Aug 23, 2023
6 changes: 3 additions & 3 deletions src/binary/binary_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ impl<W: Write> IonWriter for BinaryWriter<W> {
panic!("Cannot set symbol ID ${symbol_id} as annotation. It is undefined.");
}
}
RawSymbolTokenRef::Text(text) => self.get_or_create_symbol_id(text),
RawSymbolTokenRef::Text(text) => self.get_or_create_symbol_id(text.as_ref()),
};
self.raw_writer.add_annotation(symbol_id);
}
Expand All @@ -145,7 +145,7 @@ impl<W: Write> IonWriter for BinaryWriter<W> {
));
}
}
RawSymbolTokenRef::Text(text) => self.get_or_create_symbol_id(text),
RawSymbolTokenRef::Text(text) => self.get_or_create_symbol_id(text.as_ref()),
};
self.raw_writer.write_symbol(symbol_id)
}
Expand All @@ -159,7 +159,7 @@ impl<W: Write> IonWriter for BinaryWriter<W> {
panic!("Cannot set symbol ID ${symbol_id} as field name. It is undefined.");
}
}
RawSymbolTokenRef::Text(text) => self.get_or_create_symbol_id(text),
RawSymbolTokenRef::Text(text) => self.get_or_create_symbol_id(text.as_ref()),
};
self.raw_writer.set_field_name(text);
}
Expand Down
191 changes: 172 additions & 19 deletions src/lazy/text/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ use std::slice::Iter;

use nom::branch::alt;
use nom::bytes::streaming::{is_a, is_not, tag, take_until, take_while1};
use nom::character::streaming::{char, digit1, one_of};
use nom::combinator::{fail, map, opt, peek, recognize, success, value};
use nom::character::streaming::{char, digit1, one_of, satisfy};
use nom::combinator::{fail, map, not, opt, peek, recognize, success, value};
use nom::error::{ErrorKind, ParseError};
use nom::multi::many0_count;
use nom::sequence::{delimited, pair, preceded, separated_pair, terminated, tuple};
Expand All @@ -16,9 +16,9 @@ use crate::lazy::encoding::TextEncoding;
use crate::lazy::raw_stream_item::RawStreamItem;
use crate::lazy::text::encoded_value::EncodedTextValue;
use crate::lazy::text::matched::{
MatchedFloat, MatchedInt, MatchedShortString, MatchedString, MatchedValue,
MatchedFloat, MatchedInt, MatchedShortString, MatchedString, MatchedSymbol, MatchedValue,
};
use crate::lazy::text::parse_result::IonParseError;
use crate::lazy::text::parse_result::{InvalidInputError, IonParseError};
use crate::lazy::text::parse_result::{IonMatchResult, IonParseResult};
use crate::lazy::text::value::LazyRawTextValue;
use crate::result::DecodingError;
Expand Down Expand Up @@ -275,6 +275,16 @@ impl<'data> TextBufferView<'data> {
)
},
),
map(
match_and_length(Self::match_symbol),
|(matched_symbol, length)| {
EncodedTextValue::new(
MatchedValue::Symbol(matched_symbol),
self.offset(),
length,
)
},
),
// TODO: The other Ion types
))
.map(|encoded_value| LazyRawTextValue {
Expand Down Expand Up @@ -463,6 +473,7 @@ impl<'data> TextBufferView<'data> {
Self::match_float_numeric_value,
))(self)
}

/// Matches special IEEE-754 values, including +/- infinity and NaN.
fn match_float_special_value(self) -> IonParseResult<'data, MatchedFloat> {
alt((
Expand Down Expand Up @@ -577,6 +588,109 @@ impl<'data> TextBufferView<'data> {
/// Matches the body of a short string by delegating to `match_text_until_unescaped` with
/// a double quote (`"`) as the delimiter.
///
/// On success, yields the matched buffer paired with a flag that reports whether any
/// escaped characters were found in it.
fn match_short_string_body(self) -> IonParseResult<'data, (Self, bool)> {
    // Short strings are closed by an unescaped `"`; the shared helper does the scanning.
    self.match_text_until_unescaped(b'\"')
}

/// Placeholder for matching a long string. Long string matching has not been implemented
/// yet, so this parser currently never matches.
fn match_long_string(self) -> IonParseResult<'data, MatchedString> {
// TODO: implement long string matching
// The `fail` parser is a nom builtin that never matches.
fail(self)
}
Comment on lines +594 to +598
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🗺️ This placeholder method was moved from further down in the file.


/// Matches any supported symbol encoding: a symbol ID (`$28`), an identifier (`foo`),
/// or a quoted symbol (`'foo'`).
///
/// The alternatives are attempted in that order; the first one that matches wins.
fn match_symbol(self) -> IonParseResult<'data, MatchedSymbol> {
    // TODO: operators
    let mut any_symbol = alt((
        Self::match_symbol_id,
        Self::match_identifier,
        Self::match_quoted_symbol,
    ));
    any_symbol(self)
}

/// Matches a symbol ID (`$28`).
///
/// A symbol ID is a `$` followed by one or more digits. Note that the digits:
///   * CANNOT have underscores in them. For example: `$1_0` is considered an identifier.
///   * CAN have leading zeros. There's precedent for this in ion-java.
fn match_symbol_id(self) -> IonParseResult<'data, MatchedSymbol> {
    // The `$` sigil is discarded; what follows is the integer representing the symbol ID.
    let sid_digits = preceded(tag("$"), digit1);

    // The spec does not offer a formal definition of what ends a symbol ID, so we peek at
    // the next character to make sure it's unrelated to the symbol ID. Acceptable
    // followers are a stop character (that parser performs its own `peek()`) or a colon,
    // which could be a field delimiter (":") or the start of an annotation delimiter ("::").
    let sid_terminator = alt((
        // Every parser given to `alt` must produce the same output type. The stop
        // character parser yields a char, so `recognize()` is used to get the matched
        // input instead, matching what `peek(tag(..))` produces.
        recognize(Self::peek_stop_character),
        peek(tag(":")),
    ));

    recognize(terminated(sid_digits, sid_terminator))
        .map(|_matched| MatchedSymbol::SymbolId)
        .parse(self)
}

/// Matches an identifier (`foo`).
fn match_identifier(self) -> IonParseResult<'data, MatchedSymbol> {
let (remaining, identifier_text) = recognize(terminated(
pair(
Self::identifier_initial_character,
Self::identifier_trailing_characters,
),
not(Self::identifier_trailing_character),
))(self)?;
// Ion defines a number of keywords that are syntactically indistinguishable from
// identifiers. Keywords take precedence; we must ensure that any identifier we find
// is not actually a keyword.
const KEYWORDS: &[&str] = &["true", "false", "nan", "null"];
// In many situations, this check will not be necessary. Another type's parser will
// recognize the keyword as its own. (For example, `parse_boolean` would match the input
// text `false`.) However, because symbols can appear in annotations and the check for
// annotations precedes the parsing for all other types, we need this extra verification.
if KEYWORDS
.iter()
.any(|k| k.as_bytes() == identifier_text.bytes())
{
// Finding a keyword is not a fatal error, it just means that this parser doesn't match.
return Err(nom::Err::Error(IonParseError::Invalid(
InvalidInputError::new(self),
)));
}
Ok((remaining, MatchedSymbol::Identifier))
}

/// Matches a single character that may begin an identifier: `$`, `_`, or an ASCII letter.
fn identifier_initial_character(self) -> IonParseResult<'data, Self> {
    let sigil_or_underscore = one_of("$_");
    let ascii_letter = satisfy(|ch: char| ch.is_ascii_alphabetic());
    recognize(alt((sigil_or_underscore, ascii_letter)))(self)
}

/// Matches a single character that may appear anywhere in an identifier, though not
/// necessarily at the beginning: `$`, `_`, an ASCII letter, or an ASCII digit.
fn identifier_trailing_character(self) -> IonParseResult<'data, Self> {
    let sigil_or_underscore = one_of("$_");
    let ascii_letter_or_digit = satisfy(|ch: char| ch.is_ascii_alphanumeric());
    recognize(alt((sigil_or_underscore, ascii_letter_or_digit)))(self)
}

/// Matches a run of zero or more characters that are legal in an identifier, though not
/// necessarily at the beginning.
fn identifier_trailing_characters(self) -> IonParseResult<'data, Self> {
    let mut trailing_run = recognize(many0_count(Self::identifier_trailing_character));
    trailing_run(self)
}

/// Matches a quoted symbol (`'foo'`).
///
/// The resulting `MatchedSymbol::Quoted` records whether the text between the single
/// quotes contained any escaped characters.
fn match_quoted_symbol(self) -> IonParseResult<'data, MatchedSymbol> {
    let (remaining, (_body, contains_escaped_chars)) =
        delimited(char('\''), Self::match_quoted_symbol_body, char('\''))(self)?;
    Ok((remaining, MatchedSymbol::Quoted(contains_escaped_chars)))
}

/// Matches the body of a quoted symbol by delegating to `match_text_until_unescaped` with
/// a single quote (`'`) as the delimiter.
///
/// On success, yields the matched buffer paired with a flag that reports whether any
/// escaped characters were found in the quoted symbol.
fn match_quoted_symbol_body(self) -> IonParseResult<'data, (Self, bool)> {
    // Quoted symbols are closed by an unescaped `'`; the shared helper does the scanning.
    self.match_text_until_unescaped(b'\'')
}

/// A helper method for matching bytes until the specified delimiter. Ignores any byte
/// (including the delimiter) that is prefaced by the escape character `\`.
fn match_text_until_unescaped(self, delimiter: u8) -> IonParseResult<'data, (Self, bool)> {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🗺️ This method was previously match_short_string_body, but it's generally useful for both strings and symbols. match_short_string_body and match_quoted_symbol_body now call this.

let mut is_escaped = false;
let mut contains_escaped_chars = false;
for (index, byte) in self.bytes().iter().enumerate() {
Expand All @@ -590,20 +704,14 @@ impl<'data> TextBufferView<'data> {
contains_escaped_chars = true;
continue;
}
if *byte == b'\"' {
if *byte == delimiter {
let matched = self.slice(0, index);
let remaining = self.slice_to_end(index);
return Ok((remaining, (matched, contains_escaped_chars)));
}
}
Err(nom::Err::Incomplete(Needed::Unknown))
}

fn match_long_string(self) -> IonParseResult<'data, MatchedString> {
// TODO: implement long string matching
// The `fail` parser is a nom builtin that never matches.
fail(self)
}
}

// === nom trait implementations ===
Expand Down Expand Up @@ -839,13 +947,17 @@ mod tests {
P: Parser<TextBufferView<'data>, O, IonParseError<'data>>,
{
let result = self.try_match(parser);
// We expect this to fail for one reason or another
assert!(
result.is_err(),
"Expected a parse failure for input: {:?}\nResult: {:?}",
self.input,
result
);
// We expect that only part of the input will match or that the entire
// input will be rejected outright.
if let Ok((_remaining, match_length)) = result {
assert_ne!(
match_length,
self.input.len() - 1,
"parser unexpectedly matched the complete input: '{:?}\nResult: {:?}",
self.input,
result
);
}
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🗺️ Prior to this change, this unit test method would assert that there was no match. However, it was possible for the parser to match part of the input and report success. Now this method requires that the parser match the entire test input to be considered a successful match.

}
}

Expand Down Expand Up @@ -1038,13 +1150,54 @@ mod tests {
r#"
hello"
"#,
// Missing a trailing quote
// Missing a closing quote
r#"
"hello
"#,
// Closing quote is escaped
r#"
"hello\"
"#,
];
for input in bad_inputs {
mismatch_string(input);
}
}

#[test]
fn test_match_symbol() {
fn match_symbol(input: &str) {
MatchTest::new(input).expect_match(match_length(TextBufferView::match_symbol));
}
fn mismatch_symbol(input: &str) {
MatchTest::new(input).expect_mismatch(match_length(TextBufferView::match_symbol));
}

// Each input is handed to the symbol matcher exactly as written; no trimming is performed,
// so the inputs must not have leading or trailing whitespace.
Comment on lines +1176 to +1177
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Outdated comment?

let good_inputs = &[
"'hello'",
"'😀😀😀'",
"'this has an escaped quote \\' right in the middle'",
"$308",
"$0",
"foo",
"name",
"$bar",
"_baz_quux",
];
for input in good_inputs {
match_symbol(input);
}

let bad_inputs = &[
"'hello", // No closing quote
"'hello\\'", // Closing quote is escaped
"$-8", // Negative SID
"nan", // Identifier that is also a keyword
];
for input in bad_inputs {
mismatch_symbol(input);
}
}
}
1 change: 1 addition & 0 deletions src/lazy/text/encoded_value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ impl EncodedTextValue {
MatchedValue::Int(_) => IonType::Int,
MatchedValue::Float(_) => IonType::Float,
MatchedValue::String(_) => IonType::String,
MatchedValue::Symbol(_) => IonType::Symbol,
}
}

Expand Down
Loading