Skip to content

Commit

Permalink
Replace LinkedHashMap with IndexMap (#293)
Browse files Browse the repository at this point in the history
* Replace LinkedHashMap with IndexMap

* Adapt examples
  • Loading branch information
Heinenen authored Aug 9, 2024
1 parent 03997e5 commit 79f5b41
Show file tree
Hide file tree
Showing 4 changed files with 119 additions and 106 deletions.
11 changes: 5 additions & 6 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ chrono = { version = "^0.4", optional = true, features = [
encoding_rs = "0.8.32"
flate2 = "^1.0"
image = { version = "^0.24", optional = true }
indexmap = "2.2.3"
itoa = "^1.0"
linked-hash-map = "^0.5"
log = "^0.4"
md-5 = "0.10"
nom = { version = "^7.1", optional = true }
Expand All @@ -34,23 +34,22 @@ serde = { version = "1.0", features = ["derive"], optional = true }
time = { version = "^0.3", features = ["formatting", "parsing"] }
tokio = { version = "1", features = ["fs", "io-util"], optional = true }
weezl = "0.1"
indexmap = "2.2.3"

[dev-dependencies]
tempfile = "3.3"
serde_json = "1.0"
clap = { version = "4.0", features = ["derive"] }
shellexpand = "3.0"
env_logger = "0.10"
serde_json = "1.0"
shellexpand = "3.0"
tempfile = "3.3"

[features]
async = ["tokio/rt-multi-thread", "tokio/macros"]
chrono_time = ["chrono"]
default = ["chrono_time", "nom_parser", "rayon"]
embed_image = ["image"]
nom_parser = ["nom"]
pom_parser = ["pom"]
serde = ["dep:serde"]
async = ["tokio/rt-multi-thread", "tokio/macros"]

[[example]]
name = "add_barcode"
Expand Down
128 changes: 64 additions & 64 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,22 +20,22 @@ use lopdf::dictionary;
use lopdf::{Document, Object, Stream};
use lopdf::content::{Content, Operation};

// with_version specifes the PDF version this document complies with.
// `with_version` specifies the PDF version this document complies with.
let mut doc = Document::with_version("1.5");
// Object IDs are used for cross referencing in PDF documents. `lopdf` helps keep track of them
// for us. They are simple integers.
// Calls to `doc.new_object_id` and `doc.add_object` return an object id
// Object IDs are used for cross referencing in PDF documents.
// `lopdf` helps keep track of them for us. They are simple integers.
// Calls to `doc.new_object_id` and `doc.add_object` return an object ID.

// pages is the root node of the page tree
// "Pages" is the root node of the page tree.
let pages_id = doc.new_object_id();

// fonts are dictionaries. The type, subtype and basefont tags
// are straight out of the PDF reference manual
// Fonts are dictionaries. The "Type", "Subtype" and "BaseFont" tags
// are straight out of the PDF spec.
//
// The dictionary macro is a helper that allows complex
// key, value relationships to be represented in a simpler
// key-value relationships to be represented in a simpler
// visual manner, similar to a match statement.
// Dictionary is linkedHashMap of byte vector, and object
// A dictionary is implemented as an IndexMap of Vec<u8>, and Object
let font_id = doc.add_object(dictionary! {
// type of dictionary
"Type" => "Font",
Expand All @@ -46,13 +46,13 @@ let font_id = doc.add_object(dictionary! {
"BaseFont" => "Courier",
});

// font dictionaries need to be added into resource dictionaries
// in order to be used.
// Font dictionaries need to be added into resource
// dictionaries in order to be used.
// Resource dictionaries can contain more than just fonts,
// but normally just contains fonts
// Only one resource dictionary is allowed per page tree root
// but normally just contains fonts.
// Only one resource dictionary is allowed per page tree root.
let resources_id = doc.add_object(dictionary! {
// fonts are actually triplely nested dictionaries. Fun!
// Fonts are actually triply nested dictionaries. Fun!
"Font" => dictionary! {
// F1 is the font name used when writing text.
// It must be unique in the document. It does not
Expand All @@ -61,75 +61,75 @@ let resources_id = doc.add_object(dictionary! {
},
});

// Content is a wrapper struct around an operations struct that contains a vector of operations
// The operations struct contains a vector of operations that match up with a particular PDF
// operator and operands.
// Reference the PDF reference for more details on these operators and operands.
// Note, the operators and operands are specified in a reverse order than they
// actually appear in the PDF file itself.
// `Content` is a wrapper struct around an operations struct that contains
// a vector of operations. The operations struct contains a vector of
// operations that match up with a particular PDF operator and operands.
// Refer to the PDF spec for more details on the operators and operands.
// Note, the operators and operands are specified in a reverse order
// from how they actually appear in the PDF file itself.
let content = Content {
operations: vec![
// BT begins a text element. it takes no operands
// BT begins a text element. It takes no operands.
Operation::new("BT", vec![]),
// Tf specifies the font and font size. Font scaling is complicated in PDFs. Reference
// the reference for more info.
// The into() methods are defined based on their paired .from() methods (this
// functionality is built into rust), and are converting the provided values into
// An enum that represents the basic object types in PDF documents.
// Tf specifies the font and font size.
// Font scaling is complicated in PDFs.
// Refer to the spec for more info.
// The `into()` methods convert the types into
// an enum that represents the basic object types in PDF documents.
Operation::new("Tf", vec!["F1".into(), 48.into()]),
// Td adjusts the translation components of the text matrix. When used for the first
// time after BT, it sets the initial text position on the page.
// Td adjusts the translation components of the text matrix.
// When used for the first time after BT, it sets the initial
// text position on the page.
// Note: PDF documents have Y=0 at the bottom. Thus 600 to print text near the top.
Operation::new("Td", vec![100.into(), 600.into()]),
// Tj prints a string literal to the page. By default, this is black text that is
// filled in. There are other operators that can produce various textual effects and
// colors
Operation::new("Tj", vec![Object::string_literal("Hello World!")]),
// ET ends the text element
// ET ends the text element.
Operation::new("ET", vec![]),
],
};

// Streams are a dictionary followed by a sequence of bytes. What that sequence of bytes
// represents depends on context
// The stream dictionary is set internally to lopdf and normally doesn't
// Streams are a dictionary followed by a (possibly encoded) sequence of bytes.
// What that sequence of bytes represents depends on the context.
// The stream dictionary is set internally by lopdf and normally doesn't
// need to be manually manipulated. It contains keys such as
// Length, Filter, DecodeParams, etc
//
// content is a stream of encoded content data.
// Length, Filter, DecodeParams, etc.
let content_id = doc.add_object(Stream::new(dictionary! {}, content.encode().unwrap()));

// Page is a dictionary that represents one page of a PDF file.
// It has a type, parent and contents
// Its required fields are "Type", "Parent" and "Contents".
let page_id = doc.add_object(dictionary! {
"Type" => "Page",
"Parent" => pages_id,
"Contents" => content_id,
});

// Again, pages is the root of the page tree. The ID was already created
// at the top of the page, since we needed it to assign to the parent element of the page
// dictionary
// Again, "Pages" is the root of the page tree. The ID was already created
// at the top of the page, since we needed it to assign to the parent element
// of the page dictionary.
//
// This is just the basic requirements for a page tree root object. There are also many
// additional entries that can be added to the dictionary if needed. Some of these can also be
// defined on the page dictionary itself, and not inherited from the page tree root.
// These are just the basic requirements for a page tree root object.
// There are also many additional entries that can be added to the dictionary,
// if needed. Some of these can also be defined on the page dictionary itself,
// and not inherited from the page tree root.
let pages = dictionary! {
// Type of dictionary
"Type" => "Pages",
// Vector of page IDs in document. Normally would contain more than one ID and be produced
// using a loop of some kind
// Vector of page IDs in document. Normally would contain more than one ID
// and be produced using a loop of some kind.
"Kids" => vec![page_id.into()],
// Page count
"Count" => 1,
// ID of resources dictionary, defined earlier
"Resources" => resources_id,
// a rectangle that defines the boundaries of the physical or digital media. This is the
// "Page Size"
// A rectangle that defines the boundaries of the physical or digital media.
// This is the "page size".
"MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
};

// using insert() here, instead of add_object() since the id is already known.
// Using `insert()` here, instead of `add_object()` since the ID is already known.
doc.objects.insert(pages_id, Object::Dictionary(pages));

// Creating document catalog.
Expand All @@ -139,8 +139,8 @@ let catalog_id = doc.add_object(dictionary! {
"Pages" => pages_id,
});

// Root key in trailer is set here to ID of document catalog,
// remainder of trailer is set during doc.save().
// The "Root" key in trailer is set to the ID of the document catalog,
// the remainder of the trailer is set during `doc.save()`.
doc.trailer.set("Root", catalog_id);
doc.compress();

Expand Down Expand Up @@ -207,15 +207,15 @@ pub fn generate_fake_document() -> Document {
}

fn main() -> std::io::Result<()> {
// Generate a stack of Documents to merge
// Generate a stack of Documents to merge.
let documents = vec![
generate_fake_document(),
generate_fake_document(),
generate_fake_document(),
generate_fake_document(),
];

// Define a starting max_id (will be used as start index for object_ids)
// Define a starting `max_id` (will be used as start index for object_ids).
let mut max_id = 1;
let mut pagenum = 1;
// Collect all Documents Objects grouped by a map
Expand Down Expand Up @@ -251,17 +251,17 @@ fn main() -> std::io::Result<()> {
documents_objects.extend(doc.objects);
}

// Catalog and Pages are mandatory
// "Catalog" and "Pages" are mandatory.
let mut catalog_object: Option<(ObjectId, Object)> = None;
let mut pages_object: Option<(ObjectId, Object)> = None;

// Process all objects except "Page" type
for (object_id, object) in documents_objects.iter() {
// We have to ignore "Page" (as are processed later), "Outlines" and "Outline" objects
// All other objects should be collected and inserted into the main Document
// We have to ignore "Page" (as are processed later), "Outlines" and "Outline" objects.
// All other objects should be collected and inserted into the main Document.
match object.type_name().unwrap_or("") {
"Catalog" => {
// Collect a first "Catalog" object and use it for the future "Pages"
// Collect a first "Catalog" object and use it for the future "Pages".
catalog_object = Some((
if let Some((id, _)) = catalog_object {
id
Expand Down Expand Up @@ -301,7 +301,7 @@ fn main() -> std::io::Result<()> {
}
}

// If no "Pages" object found abort
// If no "Pages" object found, abort.
if pages_object.is_none() {
println!("Pages root not found.");

Expand All @@ -320,7 +320,7 @@ fn main() -> std::io::Result<()> {
}
}

// If no "Catalog" found abort
// If no "Catalog" found, abort.
if catalog_object.is_none() {
println!("Catalog root not found.");

Expand Down Expand Up @@ -370,10 +370,10 @@ fn main() -> std::io::Result<()> {
// Reorder all new Document objects
document.renumber_objects();

//Set any Bookmarks to the First child if they are not set to a page
// Set any Bookmarks to the First child if they are not set to a page
document.adjust_zero_pages();

//Set all bookmarks to the PDF Object tree then set the Outlines to the Bookmark content map.
// Set all bookmarks to the PDF Object tree then set the Outlines to the Bookmark content map.
if let Some(n) = document.build_outline() {
if let Ok(x) = document.get_object_mut(catalog_object.0) {
if let Object::Dictionary(ref mut dict) = x {
Expand All @@ -384,9 +384,9 @@ fn main() -> std::io::Result<()> {

document.compress();

// Save the merged PDF
// Save the merged PDF.
// Store file in current working directory.
// Note: Line is excluded when running tests
// Note: Line is excluded when running doc tests
if false {
document.save("merged.pdf").unwrap();
}
Expand Down Expand Up @@ -439,8 +439,8 @@ use lopdf::Document;

* Why does the library keep everything in memory as high-level objects until finally serializing the entire document?

Normally a PDF document won't be very large, ranging from tens of KB to hundreds of MB. Memory size is not a bottle neck for today's computer.
By keeping the whole document in memory, stream length can be pre-calculated, no need to use a reference object for the Length entry,
the resulting PDF file is smaller for distribution and faster for PDF consumers to process.
Normally, a PDF document won't be very large, ranging from tens of KB to hundreds of MB. Memory size is not a bottleneck for today's computers.
By keeping the whole document in memory, the stream length can be pre-calculated, no need to use a reference object for the Length entry.
The resulting PDF file is smaller for distribution and faster for PDF consumers to process.

Producing is a one-time effort, while consuming is many more.
Loading

0 comments on commit 79f5b41

Please sign in to comment.