Skip to content

Commit

Permalink
Improve detection of invalid UTF-8 sequences
Browse files (browse the repository at this point in the history)
  • Loading branch information
tustvold committed Jan 26, 2023
1 parent ee082ce commit b58a962
Showing 1 changed file with 14 additions and 3 deletions.
17 changes: 14 additions & 3 deletions arrow-json/src/raw/tape.rs
Original file line number | Diff line number | Diff line change
Expand Up @@ -96,10 +96,13 @@ pub struct Tape<'a> {

impl<'a> Tape<'a> {
/// Returns the string for the given string index
///
/// The returned slice of `strings` is bounded by the adjacent entries
/// `string_offsets[idx]` (start) and `string_offsets[idx + 1]` (end).
///
/// # Panics
///
/// Presumably panics if `idx + 1` is out of bounds for `string_offsets`
/// (checked indexing is used for both lookups) — type not shown here.
#[inline]
pub fn get_string(&self, idx: u32) -> &'a str {
let end_offset = self.string_offsets[idx as usize + 1];
let start_offset = self.string_offsets[idx as usize];
// NOTE(review): this page is a rendered diff without +/- markers. The safe
// slice on the next line appears to be the line removed by this commit and
// the `unsafe` block after it its replacement — only one of the two is
// expected to exist in the real file.
&self.strings[start_offset..end_offset]
// SAFETY:
// Verified offsets
// NOTE(review): presumably refers to the `is_char_boundary` check this commit
// adds before the `Tape` is constructed. `get_unchecked` additionally
// requires `start_offset <= end_offset <= strings.len()`, which is not
// established by anything visible here — TODO confirm.
unsafe { self.strings.get_unchecked(start_offset..end_offset) }
}

/// Returns the tape element at `idx`
Expand Down Expand Up @@ -459,6 +462,14 @@ impl TapeDecoder {
ArrowError::JsonError("Encountered non-UTF-8 data".to_string())
})?;

for offset in self.offsets.iter().copied() {
if !strings.is_char_boundary(offset) {
return Err(ArrowError::JsonError(
"Encountered truncated UTF-8 sequence".to_string(),
));
}
}

Ok(Tape {
strings,
elements: &self.elements,
Expand Down Expand Up @@ -783,8 +794,8 @@ mod tests {
assert_eq!(err, "Json error: Encountered non-UTF-8 data");

let mut decoder = TapeDecoder::new(16, 2);
decoder.decode(b"{\"hello\xe2\" : \"\x96\xa1world\"}").unwrap();
decoder.decode(b"{\"\xe2\" : \"\x96\xa1\"}").unwrap();
let err = decoder.finish().unwrap_err().to_string();
assert_eq!(err, "Json error: Encountered non-UTF-8 data");
assert_eq!(err, "Json error: Encountered truncated UTF-8 sequence");
}
}

0 comments on commit b58a962

Please sign in to comment.