From c2b82878375037869c185bfd146407b7b59ba218 Mon Sep 17 00:00:00 2001 From: Heinenen Date: Mon, 12 Aug 2024 17:02:09 +0200 Subject: [PATCH] Detect reference cycles when parsing streams --- src/nom_parser.rs | 27 +++++++++++++++++---------- src/reader.rs | 41 +++++++++++++++++++++++++++++------------ 2 files changed, 46 insertions(+), 22 deletions(-) diff --git a/src/nom_parser.rs b/src/nom_parser.rs index 3d7b7a4..e8d2d3b 100644 --- a/src/nom_parser.rs +++ b/src/nom_parser.rs @@ -3,6 +3,7 @@ use crate::content::*; use crate::error::XrefError; use crate::xref::*; use crate::Error; +use std::collections::HashSet; use std::str::{self, FromStr}; use nom::branch::alt; @@ -270,12 +271,12 @@ fn dictionary(input: &[u8]) -> NomResult { )(input) } -fn stream<'a>(input: &'a [u8], reader: &Reader) -> NomResult<'a, Object> { +fn stream<'a>(input: &'a [u8], reader: &Reader, already_seen: &mut HashSet<ObjectId>) -> NomResult<'a, Object> { let (i, dict) = terminated(dictionary, tuple((space, tag(b"stream"), eol)))(input)?; if let Ok(length) = dict.get(b"Length").and_then(|value| { if let Ok(id) = value.as_reference() { - reader.get_object(id).and_then(|value| value.as_i64()) + reader.get_object(id, already_seen).and_then(|value| value.as_i64()) } else { value.as_i64() } @@ -326,14 +327,17 @@ pub fn direct_object(input: &[u8]) -> Option { strip_nom(_direct_object(input)) } -fn object<'a>(input: &'a [u8], reader: &Reader) -> NomResult<'a, Object> { - terminated(alt((|input| stream(input, reader), _direct_objects)), space)(input) +fn object<'a>(input: &'a [u8], reader: &Reader, already_seen: &mut HashSet<ObjectId>) -> NomResult<'a, Object> { + terminated( + alt((|input| stream(input, reader, already_seen), _direct_objects)), + space, + )(input) } pub fn indirect_object( - input: &[u8], offset: usize, expected_id: Option<ObjectId>, reader: &Reader, + input: &[u8], offset: usize, expected_id: Option<ObjectId>, reader: &Reader, already_seen: &mut HashSet<ObjectId>, ) -> crate::Result<(ObjectId, Object)> { - let (id, mut object) = 
_indirect_object(&input[offset..], offset, expected_id, reader)?; + let (id, mut object) = _indirect_object(&input[offset..], offset, expected_id, reader, already_seen)?; offset_stream(&mut object, offset); @@ -341,7 +345,7 @@ pub fn indirect_object( } fn _indirect_object( - input: &[u8], offset: usize, expected_id: Option<ObjectId>, reader: &Reader, + input: &[u8], offset: usize, expected_id: Option<ObjectId>, reader: &Reader, already_seen: &mut HashSet<ObjectId>, ) -> crate::Result<(ObjectId, Object)> { let (i, (_, object_id)) = terminated(tuple((space, object_id)), pair(tag(b"obj"), space))(input).map_err(|_| Error::Parse { offset })?; @@ -352,8 +356,11 @@ fn _indirect_object( } let object_offset = input.len() - i.len(); - let (_, mut object) = terminated(|i| object(i, reader), tuple((space, opt(tag(b"endobj")), space)))(i) - .map_err(|_| Error::Parse { offset })?; + let (_, mut object) = terminated( + |i| object(i, reader, already_seen), + tuple((space, opt(tag(b"endobj")), space)), + )(i) + .map_err(|_| Error::Parse { offset })?; offset_stream(&mut object, object_offset); @@ -418,7 +425,7 @@ pub fn xref_and_trailer(input: &[u8], reader: &Reader) -> crate::Result<(Xref, D Ok((xref, trailer)) }), (|input| { - _indirect_object(input, 0, None, reader) + _indirect_object(input, 0, None, reader, &mut HashSet::new()) .map(|(_, obj)| { let res = match obj { Object::Stream(stream) => decode_xref_stream(stream), diff --git a/src/reader.rs b/src/reader.rs index b87b51c..d002024 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -2,7 +2,7 @@ use log::{error, warn}; use std::cmp; -use std::collections::BTreeMap; +use std::collections::{BTreeMap, HashSet}; use std::convert::TryInto; #[cfg(not(feature = "async"))] use std::fs::File; @@ -273,7 +273,7 @@ impl<'a> Reader<'a> { let entries_filter_map = |(_, entry): (&_, &_)| { if let XrefEntry::Normal { offset, .. 
} = *entry { let (object_id, mut object) = self - .read_object(offset as usize, None) + .read_object(offset as usize, None, &mut HashSet::new()) .map_err(|e| error!("Object load error: {:?}", e)) .ok()?; if let Some(filter_func) = filter_func { @@ -363,12 +363,22 @@ impl<'a> Reader<'a> { let object = self.document.get_object(object_id)?; let stream = object.as_stream()?; - stream.dict.get(b"Length").and_then(|value| { - if let Ok(id) = value.as_reference() { - return self.document.get_object(id).and_then(Object::as_i64); - } - value.as_i64() - }) + stream + .dict + .get(b"Length") + .and_then(|value| { + if let Ok(id) = value.as_reference() { + return self.document.get_object(id).and_then(Object::as_i64); + } + value.as_i64() + }) + .map_err(|err| { + error!( + "Stream dictionary of '{} {} R' is missing the Length entry", + object_id.0, object_id.1 + ); + err + }) } /// Get object offset by object id. @@ -386,19 +396,26 @@ impl<'a> Reader<'a> { } } - pub fn get_object(&self, id: ObjectId) -> Result<Object> { + pub fn get_object(&self, id: ObjectId, already_seen: &mut HashSet<ObjectId>) -> Result<Object> { + if already_seen.contains(&id) { + warn!("reference cycle detected resolving object {} {}", id.0, id.1); + return Err(Error::ReferenceCycle); + } + already_seen.insert(id); let offset = self.get_offset(id)?; - let (_, obj) = self.read_object(offset as usize, Some(id))?; + let (_, obj) = self.read_object(offset as usize, Some(id), already_seen)?; Ok(obj) } - fn read_object(&self, offset: usize, expected_id: Option<ObjectId>) -> Result<(ObjectId, Object)> { + fn read_object( + &self, offset: usize, expected_id: Option<ObjectId>, already_seen: &mut HashSet<ObjectId>, + ) -> Result<(ObjectId, Object)> { if offset > self.buffer.len() { return Err(Error::Offset(offset)); } - parser::indirect_object(self.buffer, offset, expected_id, self) + parser::indirect_object(self.buffer, offset, expected_id, self, already_seen) } fn get_xref_start(buffer: &[u8]) -> Result {