-
Notifications
You must be signed in to change notification settings - Fork 944
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Pdfstream as cmap #283
Pdfstream as cmap #283
Changes from 13 commits
8ab2e28
c022358
8e4a82a
cc40af3
fa40043
b4c261b
f1a4dce
5a0d8db
5b21098
fe38695
3125d36
3f0f05d
3d549ea
abd685f
7c03d96
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,6 +14,8 @@ | |
from .encodingdb import name2unicode | ||
from .fontmetrics import FONT_METRICS | ||
from .pdftypes import PDFException | ||
from .pdftypes import PDFStream | ||
from .pdftypes import resolve1 | ||
from .pdftypes import dict_value | ||
from .pdftypes import int_value | ||
from .pdftypes import list_value | ||
|
@@ -33,7 +35,6 @@ | |
|
||
log = logging.getLogger(__name__) | ||
|
||
|
||
def get_widths(seq): | ||
widths = {} | ||
r = [] | ||
|
@@ -52,10 +53,6 @@ def get_widths(seq): | |
widths[i] = w | ||
r = [] | ||
return widths | ||
#assert get_widths([1]) == {} | ||
#assert get_widths([1,2,3]) == {1:3, 2:3} | ||
#assert get_widths([1,[2,3],6,[7,8]]) == {1:2,2:3, 6:7,7:8} | ||
|
||
|
||
def get_widths2(seq): | ||
widths = {} | ||
|
@@ -75,22 +72,15 @@ def get_widths2(seq): | |
widths[i] = (w, (vx, vy)) | ||
r = [] | ||
return widths | ||
#assert get_widths2([1]) == {} | ||
#assert get_widths2([1,2,3,4,5]) == {1:(3, (4,5)), 2:(3, (4,5))} | ||
#assert get_widths2([1,[2,3,4,5],6,[7,8,9]]) == {1:(2, (3,4)), 6:(7, (8,9))} | ||
|
||
|
||
## FontMetricsDB | ||
## | ||
class FontMetricsDB(object):
    """Class-level accessor for the FONT_METRICS table from .fontmetrics."""

    @classmethod
    def get_metrics(cls, fontname):
        """Return the FONT_METRICS entry stored under ``fontname``.

        The lookup is a plain subscript, so whatever error FONT_METRICS
        raises for a missing key propagates to the caller.
        """
        metrics = FONT_METRICS[fontname]
        return metrics
|
||
|
||
## Type1FontHeaderParser | ||
## | ||
class Type1FontHeaderParser(PSStackParser): | ||
|
||
KEYWORD_BEGIN = KWD(b'begin') | ||
|
@@ -140,12 +130,18 @@ def do_keyword(self, pos, token): | |
|
||
|
||
NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-') | ||
""" | ||
Note: DLIdent-* isn't found in the PDF Reference but has been kept, as | ||
it is harmless and has the possibility of being a type. (inferred from a bug report/PR) | ||
""" | ||
IDENTITY_ENCODER = {'Identity-H':'Identity-H', | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You mentioned that DLIdent-* is not in the PDF reference manual. Did you find other PDF documentation that mentions this? If so, you could add a comment that refers to it so that we do not remove DLIdent-* by accident. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I haven't seen any PDF with DLIdent-*, but I have included it because it is harmless. I will also add a code comment to prevent DLIdent-* from being removed by accident. |
||
'Identity-V':'Identity-V', | ||
'DLIdent-H':'Identity-H', | ||
'DLIdent-V':'Identity-V', | ||
'OneByteIdentityH':'OneByteIdentityH', | ||
'OneByteIdentityV':'OneByteIdentityV', | ||
} | ||
|
||
|
||
## CFFFont | ||
## (Format specified in Adobe Technical Note: #5176 | ||
## "The Compact Font Format Specification") | ||
## | ||
def getdict(data): | ||
d = {} | ||
fp = BytesIO(data) | ||
|
@@ -273,6 +269,7 @@ class CFFFont(object): | |
'Light', 'Medium', 'Regular', 'Roman', 'Semibold', | ||
) | ||
|
||
|
||
class INDEX(object): | ||
|
||
def __init__(self, fp): | ||
|
@@ -373,9 +370,6 @@ def __init__(self, name, fp): | |
assert False, str(('Unhandled', format)) | ||
else: | ||
raise ValueError('unsupported charset format: %r' % format) | ||
#print self.code2gid | ||
#print self.name2gid | ||
#assert 0 | ||
return | ||
|
||
def getstr(self, sid): | ||
|
@@ -384,8 +378,6 @@ def getstr(self, sid): | |
return self.string_index[sid-len(self.STANDARD_STRINGS)] | ||
|
||
|
||
## TrueTypeFont | ||
## | ||
class TrueTypeFont(object): | ||
|
||
class CMapNotFound(Exception): | ||
|
@@ -471,8 +463,6 @@ def create_unicode_map(self): | |
return unicode_map | ||
|
||
|
||
## Fonts | ||
## | ||
class PDFFontError(PDFException):
    """Font-related error; adds nothing beyond its PDFException base."""
|
||
|
@@ -484,7 +474,6 @@ class PDFUnicodeNotDefined(PDFFontError): | |
LITERAL_TYPE1C = LIT('Type1C') | ||
|
||
|
||
# PDFFont | ||
class PDFFont(object): | ||
|
||
def __init__(self, descriptor, widths, default_width=None): | ||
|
@@ -549,7 +538,6 @@ def string_width(self, s): | |
return sum(self.char_width(cid) for cid in self.decode(s)) | ||
|
||
|
||
# PDFSimpleFont | ||
class PDFSimpleFont(PDFFont): | ||
|
||
def __init__(self, descriptor, widths, spec): | ||
|
@@ -586,7 +574,6 @@ def to_unichr(self, cid): | |
raise PDFUnicodeNotDefined(None, cid) | ||
|
||
|
||
# PDFType1Font | ||
class PDFType1Font(PDFSimpleFont): | ||
|
||
def __init__(self, rsrcmgr, spec): | ||
|
@@ -618,14 +605,12 @@ def __repr__(self): | |
return '<PDFType1Font: basefont=%r>' % self.basefont | ||
|
||
|
||
# PDFTrueTypeFont | ||
class PDFTrueTypeFont(PDFType1Font):
    """PDFType1Font subclass that only overrides the debug representation."""

    def __repr__(self):
        """Return a debug string naming this class and its ``basefont``."""
        template = '<PDFTrueTypeFont: basefont=%r>'
        return template % self.basefont
|
||
|
||
# PDFType3Font | ||
class PDFType3Font(PDFSimpleFont): | ||
|
||
def __init__(self, rsrcmgr, spec): | ||
|
@@ -648,7 +633,6 @@ def __repr__(self): | |
return '<PDFType3Font>' | ||
|
||
|
||
# PDFCIDFont | ||
class PDFCIDFont(PDFFont): | ||
|
||
def __init__(self, rsrcmgr, spec, strict=settings.STRICT): | ||
|
@@ -661,18 +645,8 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT): | |
self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {})) | ||
self.cidcoding = '%s-%s' % (resolve1(self.cidsysteminfo.get('Registry', b'unknown')).decode("latin1"), | ||
resolve1(self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1")) | ||
try: | ||
name = literal_name(spec['Encoding']) | ||
except KeyError: | ||
if strict: | ||
raise PDFFontError('Encoding is unspecified') | ||
name = 'unknown' | ||
try: | ||
self.cmap = CMapDB.get_cmap(name) | ||
except CMapDB.CMapNotFound as e: | ||
if strict: | ||
raise PDFFontError(e) | ||
self.cmap = CMap() | ||
self.cmap = self.get_cmap_from_spec(spec, strict) | ||
|
||
try: | ||
descriptor = dict_value(spec['FontDescriptor']) | ||
except KeyError: | ||
|
@@ -719,6 +693,36 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT): | |
PDFFont.__init__(self, descriptor, widths, default_width=default_width) | ||
return | ||
|
||
def get_cmap_from_spec(self, spec, strict):
    """Return the CMap named by the font spec's ``Encoding`` entry.

    ``spec['Encoding']`` is either a name object (anything exposing a
    ``name`` attribute) or a mapping/stream whose ``CMapName`` entry holds
    the name. Some PDFs spell the identity encodings differently
    ('DLIdent-H/V', 'OneByteIdentityH/V', 'Identity-H/V'); those aliases
    are normalised through IDENTITY_ENCODER before the lookup.

    Any other name is looked up in CMapDB as well, so predefined CMaps
    keep working instead of being silently replaced by an empty CMap.

    :param spec: font dictionary; only its 'Encoding' entry is read.
    :param strict: when true, a missing or unknown encoding raises
        PDFFontError; otherwise an empty CMap() is returned.
    :return: the CMap from CMapDB, or an empty CMap() as fallback.
    :raises PDFFontError: in strict mode, when 'Encoding' or 'CMapName'
        is missing, or when CMapDB does not know the name.
    """
    try:
        spec_encoding = spec['Encoding']
        if hasattr(spec_encoding, 'name'):
            cmap_name = literal_name(spec_encoding)
        else:
            cmap_name = literal_name(spec_encoding['CMapName'])
    except KeyError:
        if strict:
            raise PDFFontError('Encoding is unspecified')
        cmap_name = 'unknown'

    # NOTE(review): literal_name presumably returns a non-literal argument
    # unchanged in non-strict mode, which is how a PDFStream can end up
    # here - confirm against its definition.
    if isinstance(cmap_name, PDFStream):
        if 'CMapName' in cmap_name:
            cmap_name = cmap_name.get('CMapName').name
        elif strict:
            raise PDFFontError('CMapName unspecified for encoding')
        else:
            cmap_name = 'unknown'

    # Map identity aliases to their canonical name; leave other names as is.
    cmap_name = IDENTITY_ENCODER.get(cmap_name, cmap_name)
    try:
        return CMapDB.get_cmap(cmap_name)
    except CMapDB.CMapNotFound as e:
        if strict:
            raise PDFFontError(e)
        return CMap()
|
||
def __repr__(self): | ||
return '<PDFCIDFont: basefont=%r, cidcoding=%r>' % (self.basefont, self.cidcoding) | ||
|
||
|
@@ -743,16 +747,14 @@ def to_unichr(self, cid): | |
except KeyError: | ||
raise PDFUnicodeNotDefined(self.cidcoding, cid) | ||
|
||
|
||
# main | ||
def main(argv):
    """Parse each file named on the command line as a CFF font and print it.

    :param argv: argument vector; ``argv[0]`` is skipped and every
        remaining item is treated as a path to open in binary mode.
    :return: None, so ``sys.exit(main(...))`` exits with status 0.
    """
    for fname in argv[1:]:
        # The with-block closes the file even if parsing or printing fails.
        with open(fname, 'rb') as fp:
            # Swap in TrueTypeFont(fname, fp) to inspect a TrueType file.
            font = CFFFont(fname, fp)
            print(font)
    return
|
||
|
||
if __name__ == '__main__': | ||
sys.exit(main(sys.argv)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I might be nitpicking here: according to PEP 8, block comments should start with a hash sign (#) instead of using a multi-line string.
Advantage of using a hash sign: an intelligent editor can recognise that it is a comment.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Got it, Done.