Skip to content

Commit 5db1ff0

Browse files
committed
Adds lazy text floats
1 parent 840be4d commit 5db1ff0

File tree

6 files changed

+297
-31
lines changed

6 files changed

+297
-31
lines changed

src/lazy/text/buffer.rs

+153-3
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
use crate::lazy::encoding::TextEncoding;
22
use crate::lazy::raw_stream_item::RawStreamItem;
33
use crate::lazy::text::encoded_value::EncodedTextValue;
4-
use crate::lazy::text::matched::{MatchedInt, MatchedValue};
4+
use crate::lazy::text::matched::{MatchedFloat, MatchedInt, MatchedValue};
55
use crate::lazy::text::parse_result::IonParseError;
66
use crate::lazy::text::parse_result::{IonMatchResult, IonParseResult};
77
use crate::lazy::text::value::LazyRawTextValue;
@@ -12,7 +12,7 @@ use nom::character::streaming::{char, digit1, one_of};
1212
use nom::combinator::{map, opt, peek, recognize, success, value};
1313
use nom::error::{ErrorKind, ParseError};
1414
use nom::multi::many0_count;
15-
use nom::sequence::{delimited, pair, preceded, separated_pair, terminated};
15+
use nom::sequence::{delimited, pair, preceded, separated_pair, terminated, tuple};
1616
use nom::{CompareResult, IResult, InputLength, InputTake, Needed, Parser};
1717
use std::fmt::{Debug, Formatter};
1818
use std::iter::{Copied, Enumerate};
@@ -192,6 +192,12 @@ impl<'data> TextBufferView<'data> {
192192
EncodedTextValue::new(MatchedValue::Int(matched_int), self.offset(), length)
193193
},
194194
),
195+
map(
196+
match_and_length(Self::match_float),
197+
|(matched_float, length)| {
198+
EncodedTextValue::new(MatchedValue::Float(matched_float), self.offset(), length)
199+
},
200+
),
195201
// TODO: The other Ion types
196202
))
197203
.map(|encoded_value| LazyRawTextValue {
@@ -372,6 +378,111 @@ impl<'data> TextBufferView<'data> {
372378
fn take_base_16_digits1(self) -> IonMatchResult<'data> {
373379
take_while1(|b: u8| b.is_ascii_hexdigit())(self)
374380
}
381+
382+
/// Matches an Ion float of any syntax
383+
fn match_float(self) -> IonParseResult<'data, MatchedFloat> {
384+
alt((
385+
Self::match_float_special_value,
386+
Self::match_float_numeric_value,
387+
))(self)
388+
}
389+
390+
/// Matches special IEEE-754 floating point values, including +/- infinity and NaN.
391+
fn match_float_special_value(self) -> IonParseResult<'data, MatchedFloat> {
392+
alt((
393+
value(MatchedFloat::NotANumber, tag("nan")),
394+
value(MatchedFloat::PositiveInfinity, tag("+inf")),
395+
value(MatchedFloat::NegativeInfinity, tag("-inf")),
396+
))(self)
397+
}
398+
399+
/// Matches numeric IEEE-754 floating point values.
400+
fn match_float_numeric_value(self) -> IonParseResult<'data, MatchedFloat> {
401+
terminated(
402+
recognize(pair(
403+
Self::match_number_with_optional_dot_and_digits,
404+
Self::match_float_exponent_marker_and_digits,
405+
)),
406+
Self::peek_stop_character,
407+
)
408+
.map(|_matched| MatchedFloat::Numeric)
409+
.parse(self)
410+
}
411+
412+
/// Matches a number that may or may not have a decimal point and trailing fractional digits.
413+
/// If a decimal point is present, the fractional digits that follow it are optional.
414+
/// For example:
415+
/// 1000
416+
/// 1000.559
417+
/// -25.2
418+
fn match_number_with_optional_dot_and_digits(self) -> IonMatchResult<'data> {
419+
recognize(tuple((
420+
opt(tag("-")),
421+
Self::match_base_10_digits_before_dot,
422+
opt(Self::match_dot_followed_by_base_10_digits),
423+
)))(self)
424+
}
425+
426+
/// In a float or decimal, matches the digits that are permitted before the decimal point.
427+
/// This includes either a single zero, or a non-zero followed by any sequence of digits.
428+
fn match_digits_before_dot(self) -> IonMatchResult<'data> {
429+
alt((
430+
tag("0"),
431+
recognize(pair(Self::match_leading_digit, Self::match_trailing_digits)),
432+
))(self)
433+
}
434+
435+
/// Matches a single non-zero base 10 digit.
436+
fn match_leading_digit(self) -> IonMatchResult<'data> {
437+
recognize(one_of("123456789"))(self)
438+
}
439+
440+
/// Matches zero or more runs of base 10 digits, each optionally preceded by a single underscore; doubled and trailing underscores are not matched.
441+
fn match_trailing_digits(self) -> IonMatchResult<'data> {
442+
recognize(many0_count(preceded(opt(char('_')), digit1)))(self)
443+
}
444+
445+
/// Recognizes a decimal point followed by any number of base-10 digits.
446+
fn match_dot_followed_by_base_10_digits(self) -> IonMatchResult<'data> {
447+
recognize(preceded(tag("."), opt(Self::match_digits_after_dot)))(self)
448+
}
449+
450+
/// Like `match_digits_before_dot`, but allows leading zeros.
451+
fn match_digits_after_dot(self) -> IonMatchResult<'data> {
452+
recognize(terminated(
453+
// Zero or more digits-followed-by-underscores
454+
many0_count(pair(digit1, char('_'))),
455+
// One or more digits
456+
digit1,
457+
))(self)
458+
}
459+
460+
/// Matches an `e` or `E` followed by an optional sign (`+` or `-`) followed by one or more
461+
/// base 10 digits.
462+
fn match_float_exponent_marker_and_digits(self) -> IonMatchResult<'data> {
463+
preceded(one_of("eE"), Self::match_exponent_sign_and_digits)(self)
464+
}
465+
466+
/// Recognizes the exponent portion of a decimal (everything after the 'd') or float
467+
/// (everything after the 'e'). This includes:
468+
/// * an optional '+' OR '-'
469+
/// * one or more decimal digits, which may:
470+
/// * have underscores in between them: `1_000_000`
471+
/// * have one or more leading zeros: `0005`
472+
fn match_exponent_sign_and_digits(self) -> IonMatchResult<'data> {
473+
recognize(pair(
474+
// Optional leading sign; if there's no sign, it's not negative.
475+
opt(Self::match_any_sign),
476+
Self::match_digits_after_dot,
477+
))(self)
478+
}
479+
480+
/// Matches `-` OR `+`.
481+
///
482+
/// This is used for matching exponent signs; most places in Ion do not allow `+`.
483+
pub fn match_any_sign(self) -> IonMatchResult<'data> {
484+
alt((tag("+"), tag("-")))(self)
485+
}
375486
}
376487

377488
// === nom trait implementations ===
@@ -602,7 +713,12 @@ mod tests {
602713
{
603714
let result = self.try_match(parser);
604715
// We expect this to fail for one reason or another
605-
result.unwrap_err();
716+
assert!(
717+
result.is_err(),
718+
"Expected a parse failure for input: {:?}\nResult: {:?}",
719+
self.input,
720+
result
721+
);
606722
}
607723
}
608724

@@ -729,4 +845,38 @@ mod tests {
729845
mismatch_int(input);
730846
}
731847
}
848+
849+
#[test]
850+
fn test_match_float() {
851+
fn match_float(input: &str) {
852+
MatchTest::new(input).expect_match(match_length(TextBufferView::match_float));
853+
}
854+
fn mismatch_float(input: &str) {
855+
MatchTest::new(input).expect_mismatch(match_length(TextBufferView::match_float));
856+
}
857+
858+
let good_inputs = &[
859+
"0.0e0", "0E0", "0e0", "305e1", "305e+1", "305e-1", "305e100", "305e-100", "305e+100",
860+
"305.0e1", "0.279e3", "279e0", "279.5e0", "279.5E0",
861+
];
862+
for input in good_inputs {
863+
match_float(input);
864+
let negative = format!("-{input}");
865+
match_float(&negative);
866+
}
867+
868+
let bad_inputs = &[
869+
"305", // Integer
870+
"305e", // Has exponent delimiter but no exponent
871+
".305e", // No digits before the decimal point
872+
"305e0.5", // Fractional exponent
873+
"305e-0.5", // Negative fractional exponent
874+
"0305e1", // Leading zero
875+
"+305e1", // Leading plus sign
876+
"--305e1", // Multiple negative signs
877+
];
878+
for input in bad_inputs {
879+
mismatch_float(input);
880+
}
881+
}
732882
}

src/lazy/text/encoded_value.rs

+1
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ impl EncodedTextValue {
114114
MatchedValue::Null(ion_type) => ion_type,
115115
MatchedValue::Bool(_) => IonType::Bool,
116116
MatchedValue::Int(_) => IonType::Int,
117+
MatchedValue::Float(_) => IonType::Float,
117118
}
118119
}
119120

src/lazy/text/matched.rs

+52-5
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,17 @@
1919
//! use the previously recorded information to minimize the amount of information that needs to be
2020
//! re-discovered.
2121
22-
use crate::lazy::text::as_utf8::AsUtf8;
23-
use crate::lazy::text::buffer::TextBufferView;
24-
use crate::result::IonFailure;
25-
use crate::{Int, IonResult, IonType};
22+
use std::num::IntErrorKind;
23+
2624
use num_bigint::BigInt;
2725
use num_traits::Num;
2826
use smallvec::SmallVec;
29-
use std::num::IntErrorKind;
27+
28+
use crate::lazy::text::as_utf8::AsUtf8;
29+
use crate::lazy::text::buffer::TextBufferView;
30+
use crate::lazy::text::parse_result::InvalidInputError;
31+
use crate::result::IonFailure;
32+
use crate::{Int, IonError, IonResult, IonType};
3033

3134
/// A partially parsed Ion value.
3235
#[derive(Copy, Clone, Debug, PartialEq)]
@@ -35,6 +38,7 @@ pub(crate) enum MatchedValue {
3538
Null(IonType),
3639
Bool(bool),
3740
Int(MatchedInt),
41+
Float(MatchedFloat),
3842
// TODO: ...the other types
3943
}
4044

@@ -107,3 +111,46 @@ impl MatchedInt {
107111
Ok(int)
108112
}
109113
}
114+
115+
/// A partially parsed Ion float.
116+
#[derive(Copy, Clone, Debug, PartialEq)]
117+
pub(crate) enum MatchedFloat {
118+
/// `+inf`
119+
PositiveInfinity,
120+
/// `-inf`
121+
NegativeInfinity,
122+
/// `nan`
123+
NotANumber,
124+
/// Any numeric float value
125+
Numeric,
126+
}
127+
128+
impl MatchedFloat {
129+
// Floats that take more than 32 bytes of text to represent will heap allocate a larger buffer.
130+
const STACK_ALLOC_BUFFER_CAPACITY: usize = 32;
131+
132+
pub fn read(&self, matched_input: TextBufferView) -> IonResult<f64> {
133+
use std::str::FromStr;
134+
135+
match self {
136+
MatchedFloat::PositiveInfinity => return Ok(f64::INFINITY),
137+
MatchedFloat::NegativeInfinity => return Ok(f64::NEG_INFINITY),
138+
MatchedFloat::NotANumber => return Ok(f64::NAN),
139+
MatchedFloat::Numeric => {} // fall through
140+
};
141+
142+
let mut sanitized: SmallVec<[u8; Self::STACK_ALLOC_BUFFER_CAPACITY]> =
143+
SmallVec::with_capacity(Self::STACK_ALLOC_BUFFER_CAPACITY);
144+
sanitized.extend(matched_input.bytes().iter().copied().filter(|b| *b != b'_'));
145+
146+
let text = sanitized.as_utf8(matched_input.offset())?;
147+
let float = f64::from_str(text).map_err(|e| {
148+
let error: IonError = InvalidInputError::new(matched_input)
149+
.with_description(format!("encountered an unexpected error ({:?})", e))
150+
.with_label("parsing a float")
151+
.into();
152+
error
153+
})?;
154+
Ok(float)
155+
}
156+
}

src/lazy/text/parse_result.rs

+45-23
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,25 @@ impl<'data> From<InvalidInputError<'data>> for IonParseError<'data> {
143143
}
144144
}
145145

146+
// We cannot provide an analogous impl for `Incomplete` because it is missing necessary data.
147+
impl<'data> From<InvalidInputError<'data>> for IonError {
148+
fn from(invalid_input_error: InvalidInputError) -> Self {
149+
let mut message = String::from(
150+
invalid_input_error
151+
.description()
152+
.unwrap_or("invalid Ion syntax encountered"),
153+
);
154+
if let Some(label) = invalid_input_error.label {
155+
message.push_str(" while ");
156+
message.push_str(label.as_ref());
157+
}
158+
let position = Position::with_offset(invalid_input_error.input.offset())
159+
.with_length(invalid_input_error.input.len());
160+
let decoding_error = DecodingError::new(message).with_position(position);
161+
IonError::Decoding(decoding_error)
162+
}
163+
}
164+
146165
impl<'data> From<nom::Err<IonParseError<'data>>> for IonParseError<'data> {
147166
fn from(value: Err<IonParseError<'data>>) -> Self {
148167
match value {
@@ -200,6 +219,31 @@ pub(crate) trait AddContext<'data, T> {
200219
) -> IonResult<(TextBufferView<'data>, T)>;
201220
}
202221

222+
impl<'data, T> AddContext<'data, T> for nom::Err<IonParseError<'data>> {
223+
fn with_context(
224+
self,
225+
label: impl Into<Cow<'static, str>>,
226+
input: TextBufferView<'data>,
227+
) -> IonResult<(TextBufferView<'data>, T)> {
228+
let ipe = IonParseError::from(self);
229+
ipe.with_context(label, input)
230+
}
231+
}
232+
233+
// Turns an IonParseError into an IonResult
234+
impl<'data, T> AddContext<'data, T> for IonParseError<'data> {
235+
fn with_context(
236+
self,
237+
label: impl Into<Cow<'static, str>>,
238+
input: TextBufferView<'data>,
239+
) -> IonResult<(TextBufferView<'data>, T)> {
240+
match self {
241+
IonParseError::Incomplete => IonResult::incomplete(label, input.offset()),
242+
IonParseError::Invalid(invalid_input_error) => Err(IonError::from(invalid_input_error)),
243+
}
244+
}
245+
}
246+
203247
impl<'data, T> AddContext<'data, T> for IonParseResult<'data, T> {
204248
fn with_context(
205249
self,
@@ -209,29 +253,7 @@ impl<'data, T> AddContext<'data, T> for IonParseResult<'data, T> {
209253
match self {
210254
// No change needed in the ok case
211255
Ok(matched) => Ok(matched),
212-
// If the error was an incomplete
213-
Err(e) => {
214-
// Nom error to IonParseError
215-
match IonParseError::from(e) {
216-
IonParseError::Incomplete => IonResult::incomplete(label, input.offset()),
217-
IonParseError::Invalid(invalid_input_error) => {
218-
dbg!(&invalid_input_error.backtrace);
219-
let mut message = String::from(
220-
invalid_input_error
221-
.description()
222-
.unwrap_or("invalid text Ion syntax"),
223-
);
224-
if let Some(label) = invalid_input_error.label {
225-
message.push_str(" while ");
226-
message.push_str(label.as_ref());
227-
}
228-
let position = Position::with_offset(invalid_input_error.input.offset())
229-
.with_length(invalid_input_error.input.len());
230-
let decoding_error = DecodingError::new(message).with_position(position);
231-
Err(IonError::Decoding(decoding_error))
232-
}
233-
}
234-
}
256+
Err(e) => e.with_context(label, input),
235257
}
236258
}
237259
}

0 commit comments

Comments
 (0)