diff --git a/Cargo.toml b/Cargo.toml index d58bdc9..6184fb9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,8 +23,8 @@ chrono = { version = "^0.4", optional = true, features = [ encoding_rs = "0.8.32" flate2 = "^1.0" image = { version = "^0.24", optional = true } +indexmap = "2.2.3" itoa = "^1.0" -linked-hash-map = "^0.5" log = "^0.4" md-5 = "0.10" nom = { version = "^7.1", optional = true } @@ -34,23 +34,22 @@ serde = { version = "1.0", features = ["derive"], optional = true } time = { version = "^0.3", features = ["formatting", "parsing"] } tokio = { version = "1", features = ["fs", "io-util"], optional = true } weezl = "0.1" -indexmap = "2.2.3" [dev-dependencies] -tempfile = "3.3" -serde_json = "1.0" clap = { version = "4.0", features = ["derive"] } -shellexpand = "3.0" env_logger = "0.10" +serde_json = "1.0" +shellexpand = "3.0" +tempfile = "3.3" [features] +async = ["tokio/rt-multi-thread", "tokio/macros"] chrono_time = ["chrono"] default = ["chrono_time", "nom_parser", "rayon"] embed_image = ["image"] nom_parser = ["nom"] pom_parser = ["pom"] serde = ["dep:serde"] -async = ["tokio/rt-multi-thread", "tokio/macros"] [[example]] name = "add_barcode" diff --git a/README.md b/README.md index 90596b3..28c127a 100644 --- a/README.md +++ b/README.md @@ -20,22 +20,22 @@ use lopdf::dictionary; use lopdf::{Document, Object, Stream}; use lopdf::content::{Content, Operation}; -// with_version specifes the PDF version this document complies with. +// `with_version` specifes the PDF version this document complies with. let mut doc = Document::with_version("1.5"); -// Object IDs are used for cross referencing in PDF documents. `lopdf` helps keep track of them -// for us. They are simple integers. -// Calls to `doc.new_object_id` and `doc.add_object` return an object id +// Object IDs are used for cross referencing in PDF documents. +// `lopdf` helps keep track of them for us. They are simple integers. +// Calls to `doc.new_object_id` and `doc.add_object` return an object ID. 
-// pages is the root node of the page tree +// "Pages" is the root node of the page tree. let pages_id = doc.new_object_id(); -// fonts are dictionaries. The type, subtype and basefont tags -// are straight out of the PDF reference manual +// Fonts are dictionaries. The "Type", "Subtype" and "BaseFont" tags +// are straight out of the PDF spec. // // The dictionary macro is a helper that allows complex -// key, value relationships to be represented in a simpler +// key-value relationships to be represented in a simpler // visual manner, similar to a match statement. -// Dictionary is linkedHashMap of byte vector, and object +// A dictionary is implemented as an IndexMap of Vec, and Object let font_id = doc.add_object(dictionary! { // type of dictionary "Type" => "Font", @@ -46,13 +46,13 @@ let font_id = doc.add_object(dictionary! { "BaseFont" => "Courier", }); -// font dictionaries need to be added into resource dictionaries -// in order to be used. +// Font dictionaries need to be added into resource +// dictionaries in order to be used. // Resource dictionaries can contain more than just fonts, -// but normally just contains fonts -// Only one resource dictionary is allowed per page tree root +// but normally just contains fonts. +// Only one resource dictionary is allowed per page tree root. let resources_id = doc.add_object(dictionary! { - // fonts are actually triplely nested dictionaries. Fun! + // Fonts are actually triplely nested dictionaries. Fun! "Font" => dictionary! { // F1 is the font name used when writing text. // It must be unique in the document. It does not @@ -61,75 +61,75 @@ let resources_id = doc.add_object(dictionary! { }, }); -// Content is a wrapper struct around an operations struct that contains a vector of operations -// The operations struct contains a vector of operations that match up with a particular PDF -// operator and operands. -// Reference the PDF reference for more details on these operators and operands. 
-// Note, the operators and operands are specified in a reverse order than they -// actually appear in the PDF file itself. +// `Content` is a wrapper struct around an operations struct that contains +// a vector of operations. The operations struct contains a vector of +// that match up with a particular PDF operator and operands. +// Refer to the PDF spec for more details on the operators and operands +// Note, the operators and operands are specified in a reverse order +// from how they actually appear in the PDF file itself. let content = Content { operations: vec![ - // BT begins a text element. it takes no operands + // BT begins a text element. It takes no operands. Operation::new("BT", vec![]), - // Tf specifies the font and font size. Font scaling is complicated in PDFs. Reference - // the reference for more info. - // The into() methods are defined based on their paired .from() methods (this - // functionality is built into rust), and are converting the provided values into - // An enum that represents the basic object types in PDF documents. + // Tf specifies the font and font size. + // Font scaling is complicated in PDFs. + // Refer to the spec for more info. + // The `into()` methods convert the types into + // an enum that represents the basic object types in PDF documents. Operation::new("Tf", vec!["F1".into(), 48.into()]), - // Td adjusts the translation components of the text matrix. When used for the first - // time after BT, it sets the initial text position on the page. + // Td adjusts the translation components of the text matrix. + // When used for the first time after BT, it sets the initial + // text position on the page. // Note: PDF documents have Y=0 at the bottom. Thus 600 to print text near the top. Operation::new("Td", vec![100.into(), 600.into()]), // Tj prints a string literal to the page. By default, this is black text that is // filled in. 
There are other operators that can produce various textual effects and // colors Operation::new("Tj", vec![Object::string_literal("Hello World!")]), - // ET ends the text element + // ET ends the text element. Operation::new("ET", vec![]), ], }; -// Streams are a dictionary followed by a sequence of bytes. What that sequence of bytes -// represents depends on context -// The stream dictionary is set internally to lopdf and normally doesn't +// Streams are a dictionary followed by a (possibly encoded) sequence of bytes. +// What that sequence of bytes represents, depends on the context. +// The stream dictionary is set internally by lopdf and normally doesn't // need to be manually manipulated. It contains keys such as -// Length, Filter, DecodeParams, etc -// -// content is a stream of encoded content data. +// Length, Filter, DecodeParams, etc. let content_id = doc.add_object(Stream::new(dictionary! {}, content.encode().unwrap())); // Page is a dictionary that represents one page of a PDF file. -// It has a type, parent and contents +// Its required fields are "Type", "Parent" and "Contents". let page_id = doc.add_object(dictionary! { "Type" => "Page", "Parent" => pages_id, "Contents" => content_id, }); -// Again, pages is the root of the page tree. The ID was already created -// at the top of the page, since we needed it to assign to the parent element of the page -// dictionary +// Again, "Pages" is the root of the page tree. The ID was already created +// at the top of the page, since we needed it to assign to the parent element +// of the page dictionary. // -// This is just the basic requirements for a page tree root object. There are also many -// additional entries that can be added to the dictionary if needed. Some of these can also be -// defined on the page dictionary itself, and not inherited from the page tree root. +// These are just the basic requirements for a page tree root object. 
+// There are also many additional entries that can be added to the dictionary, +// if needed. Some of these can also be defined on the page dictionary itself, +// and not inherited from the page tree root. let pages = dictionary! { // Type of dictionary "Type" => "Pages", - // Vector of page IDs in document. Normally would contain more than one ID and be produced - // using a loop of some kind + // Vector of page IDs in document. Normally would contain more than one ID + // and be produced using a loop of some kind. "Kids" => vec![page_id.into()], // Page count "Count" => 1, // ID of resources dictionary, defined earlier "Resources" => resources_id, - // a rectangle that defines the boundaries of the physical or digital media. This is the - // "Page Size" + // A rectangle that defines the boundaries of the physical or digital media. + // This is the "page size". "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()], }; -// using insert() here, instead of add_object() since the id is already known. +// Using `insert()` here, instead of `add_object()` since the ID is already known. doc.objects.insert(pages_id, Object::Dictionary(pages)); // Creating document catalog. @@ -139,8 +139,8 @@ let catalog_id = doc.add_object(dictionary! { "Pages" => pages_id, }); -// Root key in trailer is set here to ID of document catalog, -// remainder of trailer is set during doc.save(). +// The "Root" key in trailer is set to the ID of the document catalog, +// the remainder of the trailer is set during `doc.save()`. doc.trailer.set("Root", catalog_id); doc.compress(); @@ -207,7 +207,7 @@ pub fn generate_fake_document() -> Document { } fn main() -> std::io::Result<()> { - // Generate a stack of Documents to merge + // Generate a stack of Documents to merge. 
let documents = vec![ generate_fake_document(), generate_fake_document(), @@ -215,7 +215,7 @@ fn main() -> std::io::Result<()> { generate_fake_document(), ]; - // Define a starting max_id (will be used as start index for object_ids) + // Define a starting `max_id` (will be used as start index for object_ids). let mut max_id = 1; let mut pagenum = 1; // Collect all Documents Objects grouped by a map @@ -251,17 +251,17 @@ fn main() -> std::io::Result<()> { documents_objects.extend(doc.objects); } - // Catalog and Pages are mandatory + // "Catalog" and "Pages" are mandatory. let mut catalog_object: Option<(ObjectId, Object)> = None; let mut pages_object: Option<(ObjectId, Object)> = None; // Process all objects except "Page" type for (object_id, object) in documents_objects.iter() { - // We have to ignore "Page" (as are processed later), "Outlines" and "Outline" objects - // All other objects should be collected and inserted into the main Document + // We have to ignore "Page" (as are processed later), "Outlines" and "Outline" objects. + // All other objects should be collected and inserted into the main Document. match object.type_name().unwrap_or("") { "Catalog" => { - // Collect a first "Catalog" object and use it for the future "Pages" + // Collect a first "Catalog" object and use it for the future "Pages". catalog_object = Some(( if let Some((id, _)) = catalog_object { id @@ -301,7 +301,7 @@ fn main() -> std::io::Result<()> { } } - // If no "Pages" object found abort + // If no "Pages" object found, abort. if pages_object.is_none() { println!("Pages root not found."); @@ -320,7 +320,7 @@ fn main() -> std::io::Result<()> { } } - // If no "Catalog" found abort + // If no "Catalog" found, abort. 
if catalog_object.is_none() { println!("Catalog root not found."); @@ -370,10 +370,10 @@ fn main() -> std::io::Result<()> { // Reorder all new Document objects document.renumber_objects(); - //Set any Bookmarks to the First child if they are not set to a page + // Set any Bookmarks to the First child if they are not set to a page document.adjust_zero_pages(); - //Set all bookmarks to the PDF Object tree then set the Outlines to the Bookmark content map. + // Set all bookmarks to the PDF Object tree then set the Outlines to the Bookmark content map. if let Some(n) = document.build_outline() { if let Ok(x) = document.get_object_mut(catalog_object.0) { if let Object::Dictionary(ref mut dict) = x { @@ -384,9 +384,9 @@ fn main() -> std::io::Result<()> { document.compress(); - // Save the merged PDF + // Save the merged PDF. // Store file in current working directory. - // Note: Line is excluded when running tests + // Note: Line is excluded when running doc tests if false { document.save("merged.pdf").unwrap(); } @@ -439,8 +439,8 @@ use lopdf::Document; * Why does the library keep everything in memory as high-level objects until finally serializing the entire document? - Normally a PDF document won't be very large, ranging from tens of KB to hundreds of MB. Memory size is not a bottle neck for today's computer. - By keeping the whole document in memory, stream length can be pre-calculated, no need to use a reference object for the Length entry, - the resulting PDF file is smaller for distribution and faster for PDF consumers to process. + Normally, a PDF document won't be very large, ranging from tens of KB to hundreds of MB. Memory size is not a bottle neck for today's computer. + By keeping the whole document in memory, the stream length can be pre-calculated, no need to use a reference object for the Length entry. + The resulting PDF file is smaller for distribution and faster for PDF consumers to process. Producing is a one-time effort, while consuming is many more. 
diff --git a/examples/create.rs b/examples/create.rs index 2cb2115..4662433 100644 --- a/examples/create.rs +++ b/examples/create.rs @@ -7,11 +7,11 @@ fn main() { // with_version specifes the PDF version this document complies with. let mut doc = Document::with_version("1.5"); - // Object IDs are used for cross referencing in PDF documents. `lopdf` helps keep track of them - // for us. They are simple integers. - // Calls to `doc.new_object_id` and `doc.add_object` return an object id + // Object IDs are used for cross referencing in PDF documents. + // `lopdf` helps keep track of them for us. They are simple integers. + // Calls to `doc.new_object_id` and `doc.add_object` return an object IDs. - // pages is the root node of the page tree + // "Pages" is the root node of the page tree let pages_id = doc.new_object_id(); // fonts are dictionaries. The type, subtype and basefont tags @@ -20,7 +20,7 @@ fn main() { // The dictionary macro is a helper that allows complex // key, value relationships to be represented in a simpler // visual manner, similar to a match statement. - // Dictionary is linkedHashMap of byte vector, and object + // A dictionary is implemented as an IndexMap of Vec, and Object let font_id = doc.add_object(dictionary! { // type of dictionary "Type" => "Font", @@ -31,13 +31,12 @@ fn main() { "BaseFont" => "Courier", }); - // font dictionaries need to be added into resource dictionaries - // in order to be used. - // Resource dictionaries can contain more than just fonts, - // but normally just contains fonts + // Font dictionaries need to be added into resource dictionaries in order + // to be used. Resource dictionaries can contain more than just fonts, + // but normally just contains fonts. // Only one resource dictionary is allowed per page tree root let resources_id = doc.add_object(dictionary! { - // fonts are actually triplely nested dictionaries. Fun! + // Fonts are actually triplely nested dictionaries. Fun! "Font" => dictionary! 
{ // F1 is the font name used when writing text. // It must be unique in the document. It does not @@ -49,18 +48,17 @@ fn main() { // Content is a wrapper struct around an operations struct that contains a vector of operations // The operations struct contains a vector of operations that match up with a particular PDF // operator and operands. - // Reference the PDF reference for more details on these operators and operands. + // Refer to the PDF spec for more details on these operators and operands. // Note, the operators and operands are specified in a reverse order than they // actually appear in the PDF file itself. let content = Content { operations: vec![ // BT begins a text element. it takes no operands Operation::new("BT", vec![]), - // Tf specifies the font and font size. Font scaling is complicated in PDFs. Reference - // the reference for more info. - // The info() methods are defined based on their paired .from() methods (this - // functionality is built into rust), and are converting the provided values into - // An enum that represents the basic object types in PDF documents. + // Tf specifies the font and font size. Font scaling is complicated in PDFs. + // Refer to the PDF spec for more info. + // The `into()` methods convert the types into + // an enum that represents the basic object types in PDF documents. Operation::new("Tf", vec!["F1".into(), 48.into()]), // Td adjusts the translation components of the text matrix. When used for the first // time after BT, it sets the initial text position on the page. @@ -76,16 +74,14 @@ fn main() { }; // Streams are a dictionary followed by a sequence of bytes. What that sequence of bytes - // represents depends on context - // The stream dictionary is set internally to lopdf and normally doesn't + // represents, depends on context. + // The stream dictionary is set internally by lopdf and normally doesn't // need to be manually nanipulated. 
It contains keys such as - // Length, Filter, DecodeParams, etc - // - // content is a stream of encoded content data. + // Length, Filter, DecodeParams, etc. let content_id = doc.add_object(Stream::new(dictionary! {}, content.encode().unwrap())); // Page is a dictionary that represents one page of a PDF file. - // It has a type, parent and contents + // Its required fields are "Type", "Parent" and "Contents". let page_id = doc.add_object(dictionary! { "Type" => "Page", "Parent" => pages_id, @@ -96,7 +92,7 @@ fn main() { // at the top of the page, since we needed it to assign to the parent element of the page // dictionary // - // This is just the basic requirements for a page tree root object. There are also many + // These are just the basic requirements for a page tree root object. There are also many // additional entries that can be added to the dictionary if needed. Some of these can also be // defined on the page dictionary itself, and not inherited from the page tree root. let pages = dictionary! { diff --git a/src/object.rs b/src/object.rs index bce0b93..670dcd3 100644 --- a/src/object.rs +++ b/src/object.rs @@ -1,5 +1,5 @@ use crate::{Document, Error, Result}; -use linked_hash_map::{self, Iter, IterMut, LinkedHashMap}; +use indexmap::IndexMap; use log::warn; use std::cmp::max; use std::fmt; @@ -10,7 +10,7 @@ pub type ObjectId = (u32, u16); /// Dictionary object. 
#[derive(Clone, Default, PartialEq)] -pub struct Dictionary(LinkedHashMap<Vec<u8>, Object>); +pub struct Dictionary(IndexMap<Vec<u8>, Object>);
- pub fn as_hashmap_mut(&mut self) -> &mut LinkedHashMap<Vec<u8>, Object> { + /// Return a mut reference to the inner [`IndexMap`]. + pub fn as_hashmap_mut(&mut self) -> &mut IndexMap<Vec<u8>, Object> {