Skip to content

Commit

Permalink
Handle embedded CMap encodings properly
Browse files Browse the repository at this point in the history
  • Loading branch information
jrmuizel committed Sep 7, 2023
1 parent 4127e14 commit deb3aea
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 11 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ include = ["src/**/*", "README.md"]
debug = true

[dependencies]
adobe-cmap-parser = "0.3.3"
adobe-cmap-parser = "0.4.0"
encoding = "0.2.33"
euclid = "0.20.5"
lopdf = {version = "0.30", default-features = false, features = ["nom_parser"]}
Expand Down
35 changes: 25 additions & 10 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
extern crate lopdf;

use adobe_cmap_parser::{ByteMapping, CodeRange, CIDRange};
use lopdf::content::Content;
use lopdf::*;
use euclid::*;
Expand Down Expand Up @@ -813,7 +814,7 @@ struct PdfCIDFont<'a> {
#[allow(dead_code)]
doc: &'a Document,
#[allow(dead_code)]
encoding: Option<Vec<u16>>,
encoding: ByteMapping,
to_unicode: Option<HashMap<u32, String>>,
widths: HashMap<CharCode, f64>, // should probably just use i32 here
default_width: Option<f64>, // only used for CID fonts and we should probably break out the different font types
Expand Down Expand Up @@ -876,18 +877,20 @@ impl<'a> PdfCIDFont<'a> {
let encoding = maybe_get_obj(doc, font, b"Encoding").expect("Encoding required in type0 fonts");
dlog!("base_name {} {:?}", base_name, font);

match encoding {
let encoding = match encoding {
&Object::Name(ref name) => {
let name = pdf_to_utf8(name);
dlog!("encoding {:?}", name);
assert!(name == "Identity-H");
ByteMapping { codespace: vec![CodeRange{width: 2, start: 0, end: 0xffff }], cid: vec![CIDRange{ src_code_lo: 0, src_code_hi: 0xffff, dst_CID_lo: 0 }]}
}
&Object::Stream(ref stream) => {
let contents = get_contents(stream);
dlog!("Stream: {}", String::from_utf8(contents.clone()).unwrap());
adobe_cmap_parser::get_byte_mapping(&contents).unwrap()
}
_ => { panic!("unsupported encoding {:?}", encoding)}
}
};

// Sometimes a Type0 font might refer to the same underlying data as a regular font. In this case we may be able to extract some encoding
// data.
Expand Down Expand Up @@ -928,7 +931,7 @@ impl<'a> PdfCIDFont<'a> {
}
}
}
PdfCIDFont{doc, font, widths, to_unicode: unicode_map, encoding: None, default_width: Some(default_width as f64) }
PdfCIDFont{doc, font, widths, to_unicode: unicode_map, encoding, default_width: Some(default_width as f64) }
}
}

Expand All @@ -953,13 +956,25 @@ impl<'a> PdfFont for PdfCIDFont<'a> {
}*/

fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
let p = iter.next();
if let Some(&c) = p {
let next = *iter.next().unwrap();
Some((((c as u32) << 8) | next as u32, 2))
} else {
None
let mut c = *iter.next()? as u32;
let mut code = None;
for width in 1..=4 {
for range in &self.encoding.codespace {
if c as u32 >= range.start && c as u32 <= range.end && range.width == width {
code = Some((c as u32, width));
break;
}
}
let next = *iter.next()?;
c = ((c as u32) << 8) | next as u32;
}
let code = code?;
for range in &self.encoding.cid {
if code.0 >= range.src_code_lo && code.0 <= range.src_code_hi {
return Some((code.0 + range.dst_CID_lo, code.1 as u8));
}
}
None
}
fn decode_char(&self, char: CharCode) -> String {
let s = self.to_unicode.as_ref().and_then(|x| x.get(&char));
Expand Down

0 comments on commit deb3aea

Please sign in to comment.