From a426d41b004cfc635eeeec96611794f50d7ce930 Mon Sep 17 00:00:00 2001 From: sl2c Date: Sat, 14 Dec 2024 14:53:42 +0400 Subject: [PATCH] v2.6 --- ccitt.py | 37 ++++-- glyph-lists/my-glyphlist.txt | 18 +++ pdffont.py | 213 +++++++++++++++++++++++++++-------- pdffontencoding.py | 43 ++++--- pdffontglyphmap.py | 2 +- pdfimage.py | 60 ++++++++-- pdfstreameditor.py | 24 ++-- 7 files changed, 302 insertions(+), 95 deletions(-) diff --git a/ccitt.py b/ccitt.py index f1a21f7..8b04184 100644 --- a/ccitt.py +++ b/ccitt.py @@ -65,7 +65,7 @@ class Group4Decoder(object): MAKEUP_LOW_BLACK_DECODE = {v:(k+1)*64 for k,v in enumerate(MAKEUP_LOW_BLACK_ENCODE)} MAKEUP_HIGH_ENCODE = [ - '00000001000','00000001100','00000001001','000000010010','000000010011','000000010100','000000010101','000000010110', + '00000001000','00000001100','00000001101','000000010010','000000010011','000000010100','000000010101','000000010110', '000000010111','000000011100','000000011101','000000011110','000000011111' ] MAKEUP_HIGH_DECODE = {v:(k + 28)*64 for k,v in enumerate(MAKEUP_HIGH_ENCODE)} @@ -83,17 +83,22 @@ def decode(self, data:bytes, Columns:int, EncodedByteAlign:bool = False): with ends of lines padded with 0-bits to whole bytes, if necessary. 
''' - def dump(outBits:str, Columns:int, a0:int, line:int, message:str): + MODES = self.MODES_DECODE + WHITE, BLACK = 0, 1 + + peek = lambda i: inBits[inPos:inPos+i] + getBit = lambda color: '0' if color == WHITE else '1' + + def dump(inBits:str, outBits:str, Columns:int, a0:int, line:int, message:str): nBytes = (Columns + 7 ) // 8 if a0 < nBytes * 8: outBits += '0' * (nBytes * 8 - a0) from PIL import Image, ImageChops pil = Image.frombytes('1',(Columns, line+1), toBytes(outBits)) ImageChops.invert(pil).save('dump.tif') - raise ValueError(message + '\n' + 'salvaged parts of the image written to dump.tif') - - MODES = self.MODES_DECODE - WHITE, BLACK = 0, 1 + message += f'\nline = {line}, a0 = {a0}, peek = {peek(24)}' + message += '\nsalvaged parts of the image written to dump.tif' + raise ValueError(message) toBytes = lambda bits: b''.join(int(bits[i:i+8],2).to_bytes(1,'big') for i in range(0,len(bits),8)) @@ -101,8 +106,6 @@ def dump(outBits:str, Columns:int, a0:int, line:int, message:str): inBits = ''.join(f'{d:08b}' for d in data) inPos = 0 outBits = '' - peek = lambda i: inBits[inPos:inPos+i] - getBit = lambda color: '0' if color == WHITE else '1' b = [] a = [] @@ -132,8 +135,18 @@ def dump(outBits:str, Columns:int, a0:int, line:int, message:str): a0 = b2 elif l == 'HOR': # Horizontal mode - M01, inPos = self.get_run_length(inBits, inPos, color) - M12, inPos = self.get_run_length(inBits, inPos, color^1) + try: + M01, inPos = self.get_run_length(inBits, inPos, color) + except: + clr = 'white' if color == WHITE else 'black' + dump(inBits, outBits, Columns, a0, line, f'failed to get 1st ({clr}) run length') + + try: + M12, inPos = self.get_run_length(inBits, inPos, color^1) + except: + clr = 'black' if color == WHITE else 'white' + dump(inBits, outBits, Columns, a0, line, f'failed to get 2nd ({clr}) run length') + outBits += getBit(color)*M01 + getBit(color^1)*M12 a1, a2 = a0 + M01, a0 + M01 + M12 a.append(a1); a.append(a2) @@ -155,14 +168,14 @@ def 
dump(outBits:str, Columns:int, a0:int, line:int, message:str): if l is None: if peek(24) != self.EOFB: - dump(outBits, Columns, a0, line, f'unrecognized bits at line = {line}, a0 = {a0}: {peek(24)}') + dump(inBits, outBits, Columns, a0, line, f'unrecognized bits') if res := len(outBits) % 8: outBits += '0'*(8-res) return toBytes(outBits) if a0 > Columns: - dump(outBits, Columns, a0, line, f'extra bits at line = {line}, a0 = {a0}') + dump(inBits, outBits, Columns, a0, line, f'extra bits at the end of line') if a0 == Columns: a0 = -1 diff --git a/glyph-lists/my-glyphlist.txt b/glyph-lists/my-glyphlist.txt index 91e465b..edc8f4a 100644 --- a/glyph-lists/my-glyphlist.txt +++ b/glyph-lists/my-glyphlist.txt @@ -19,6 +19,8 @@ FL107;2014 Accent;0301 accent;0301 bot;22A5 +bar.one;2015 +bar.two;2015 check;2713 equaldotleftright;2252 nobreakspace;00A0 @@ -50,6 +52,7 @@ lessthanorequalangled;2A7D greaterthanorequalangled;2A7E integralex;23ae EurDig;20AC +zero.slash;0030 # ============================================== Numbers 0;0030 1;0031 @@ -873,20 +876,28 @@ VertBar1;007c AE;00C6 Complex;2102 Delta;2206 +Delta1;2206 Gamma;0393 +Gamma1;0393 Ifractur;2111 Lambda;039B +Lambda1;039B Natural;2115 OE;0152 Omega;2126 +Omega1;2126 Oslash;00D8 Phi;03A6 +Phi1;03A6 Pi;03A0 Psi;03A8 +Psi1;03A8 Real;211D Rfractur;211C Sigma;03A3 +Sigma1;03A3 Theta;0398 +Theta1;0398 Upsilon;03A5 Xi;039E Zinteger;2124 @@ -983,8 +994,11 @@ braceleftbig8;007B braceleftbig9;007B braceleftbigg;007B braceleftbt;23A9 +braceleft.bot;23A9 braceleftmid;23A8 +braceleft.mid;23A8 bracelefttp;23A7 +braceleft.top;23A7 braceright;007D bracerightBig;007D bracerightBigg;007D @@ -1000,8 +1014,11 @@ bracerightbig8;007D bracerightbig9;007D bracerightbigg;007D bracerightbt;23AD +braceright.bot;23AD bracerightmid;23AC +braceright.mid;23AC bracerighttp;23AB +braceright.top;23AB bracketleftBig;005B bracketleftBigg;005B bracketleftbig;005B @@ -1101,6 +1118,7 @@ endash;2013 epsilon1;1D716 #epsilon1;03B5 epsilon;03B5 +equal1;003D 
equivalence;2261 equivasymptotic;224D eta;03B7 diff --git a/pdffont.py b/pdffont.py index 0a4f5c0..4d4f05c 100755 --- a/pdffont.py +++ b/pdffont.py @@ -16,10 +16,11 @@ from pdfrwx.pdffontcmap import PdfFontCMap from pdfrwx.pdffilter import PdfFilter from pdfrwx.pdfgeometry import VEC, MAT, BOX +from pdfrwx.pdfobjects import PdfObjects # fontLib try: - from fontTools.ttLib import TTFont + from fontTools.ttLib import TTFont, newTable from fontTools.ttLib.tables._c_m_a_p import CmapSubtable from fontTools.t1Lib import T1Font, writePFB from fontTools.cffLib import CFFFontSet @@ -200,6 +201,16 @@ def is_symbolic(font:PdfDict): return isSymbolic + # -------------------------------------------------------------------------------- get_font_name() + + @staticmethod + def get_font_name(font:PdfDict): + ''' + Returns font's name: `font.Name` for a Type3 font and `font.BaseFont` for all others. + Note that a font may have no name, in which case `None` is returned. + ''' + return font.Name if font.Subtype == '/Type3' else font.BaseFont + # -------------------------------------------------------------------------------- get_font_descriptor() @staticmethod @@ -299,7 +310,17 @@ def get_type3_bbox(font:PdfDict): @staticmethod def get_subtype_string(font:PdfDict): ''' - String representation of the font's Subtype + Returns `subtype + suffix` where `subtype` is one of: + + `/Type1, /TrueType, /CIDFontType0, /CIDFontType2` + + and suffix is: + + - `'C'` if `FontFile3.Subtype` is `/Type1C` or `/CIDFontType0C`; + - `'-OpenType'` if `FontFile3.Subtype` is `/OpenType`; + - absent if `FontFile3` font program is absent. + + See PDF Ref. v1.7 sec. 5.8, table 5.23. 
''' suffixes = {'/Type1C':'C', '/CIDFontType0C':'C', '/OpenType':'-OpenType'} @@ -307,13 +328,8 @@ def get_subtype_string(font:PdfDict): f = font if font.Subtype != '/Type0' else font.DescendantFonts[0] - prefix = f.Subtype - - fd = f.FontDescriptor - if not fd: return prefix - - suffix = suffixes.get(fd.FontFile3.Subtype) if fd.FontFile3 else '' - return prefix + suffix + try: return f.Subtype + suffixes.get(f.FontDescriptor.FontFile3.Subtype) + except: return f.Subtype # -------------------------------------------------------------------------------- get_encoding_string() @@ -393,8 +409,7 @@ def read_pfb_info(font:bytes): for char in chars.values(): char.draw(NullPen()) # Set the maps - # info['gid2gname'] = {chr(i):gname for i,gname in enumerate(chars.keys())} - info['gid2gname'] = {chr(i):gname for i,gname in enumerate(t1.font['Encoding']) if gname != '.notdef'} + info['gid2gname'] = {i:gname for i,gname in enumerate(t1.font['Encoding']) if gname != '.notdef'} info['gname2width'] = {gname:chars[gname].width for gname in chars.keys()} # Return info @@ -487,7 +502,7 @@ def read_ttf_otf_info(font:bytes): warn(f'failed to get glyphSet from font: {fontName}') if glyphSet is not None: - try: info['gid2gname'] = {chr(ttFont.getGlyphID(gname)):gname for gname in glyphSet} + try: info['gid2gname'] = {ttFont.getGlyphID(gname):gname for gname in glyphSet} except: warn(f'failed to get gid2gname from font: {fontName}') try: info['gname2width'] = {gname:glyphSet[gname].width * z for gname in glyphSet} except: warn(f'failed to get from gname2width font: {fontName}') @@ -500,7 +515,7 @@ def read_ttf_otf_info(font:bytes): if combo(table) == (1,0): info['cmap10'] = m if combo(table) == (3,1): info['cmap30'] = m - # info['unicode2gname'] = ttFont.getBestCmap() + info['unicode2gname'] = ttFont.getBestCmap() # if re.search('MBBIVI', info['FontName']): # ttFont.save('dump.ttf') @@ -510,6 +525,37 @@ def read_ttf_otf_info(font:bytes): # Return the result return info + # 
-------------------------------------------------------------------------------- merge_cff_font_sets() + + @staticmethod + def merge_cff_font_sets(cffFont1:CFFFontSet, cffFont2:CFFFontSet): + ''' + Attempts to merge two CFF fonts. If the fonts are compatible, cffFont2 is updated with entries + from cffFont1, and the function returns True; otherwise, the two fonts are unchanged and + the function returns False. + ''' + cs1 = cffFont1[0].CharStrings + cs2 = cffFont2[0].CharStrings + + for s in cs1.values(): s.compile() + for s in cs2.values(): s.compile() + + # check compatibility + if any(gname in cs2.keys() and cs1[gname].bytecode != cs2[gname].bytecode for gname in cs1.keys()): + return False + + # check non-zero overlap + if not any(gname in cs2.keys() for gname in cs1.keys()): + return False + + # update cffFont2 with entries from cffFont1 + for gname in cs1.keys(): + if gname not in cs2.keys(): + cs2[gname] = cs1[gname] + cffFont2[0].charset.append(gname) + + return True + # -------------------------------------------------------------------------------- read_cff_info() @staticmethod @@ -542,10 +588,10 @@ def read_cff_info(cff:bytes): # gid2gname if info['ROS']: - info['gid2gname'] = {chr(int(gname[3:]) if gname != '.notdef' else 0):gname for gname in chars.keys()} + info['gid2gname'] = {(int(gname[3:]) if gname != '.notdef' else 0):gname for gname in chars.keys()} else: try: - info['gid2gname'] = {chr(i):gname for i,gname in enumerate(font.Encoding) if gname != '.notdef'} + info['gid2gname'] = {i:gname for i,gname in enumerate(font.Encoding) if gname != '.notdef'} except: warn(f'CFF font has no CharStrings: {info["FontName"]}') info['gid2gname'] = {} @@ -589,7 +635,7 @@ def read_core14_info(fontName:str): baseEncodingName = PdfFontCore14.built_in_encoding(fontName) if baseEncodingName: baseEncoding = PdfFontEncoding(name = baseEncodingName) - info['gid2gname'] = {cc:gname[1:] for cc,gname in baseEncoding.cc2glyphname.items()} + info['gid2gname'] = 
{ord(cc):gname[1:] for cc,gname in baseEncoding.cc2glyphname.items()} # gname2width name2width = PdfFontCore14.make_name2width(fontName) @@ -629,12 +675,15 @@ def fix_ttf_font(font:bytes): # -------------------------------------------------------------------------------- print_xml() @staticmethod - def print_xml(cff:CFFFontSet): + def cff_to_xml(cff:bytes, filePath:str): ''' + Writes a CFF font program to file. ''' from fontTools.misc.xmlWriter import XMLWriter - print('***** toXML()', cff.toXML(XMLWriter('__debug.xml'))) - + ttFont = TTFont() + cffFont = CFFFontSet() + cffFont.decompile(file = BytesIO(cff), otFont = ttFont) + cffFont.toXML(XMLWriter(filePath)) # =========================================================================== class PdfFont @@ -706,7 +755,7 @@ def __init__(self, self.extract_font_program() - self.info = self.read_pfb_info() if self.pfb \ + self.info = PdfFontFile.read_pfb_info(self.pfb) if self.pfb \ else PdfFontFile.read_ttf_otf_info(self.ttf or self.otf) if (self.ttf or self.otf) \ else PdfFontFile.read_cff_info(self.cff) if self.cff \ else PdfFontFile.read_core14_info(self.get_font_name()) @@ -731,7 +780,7 @@ def __init__(self, # Set font's bounding box if self.font.Subtype == '/Type3': - self.bbox = self.get_type3_bbox() + self.bbox = PdfFontDictFunc.get_type3_bbox(self.font) else: try: self.bbox = BOX(self.get_font_descriptor().FontBBox) except: self.bbox = BOX([0, 0, 1000, 1000]) @@ -798,7 +847,7 @@ def make_encoding(self, cc2gname:dict = None, cc2unicode:dict = None): cc2gname = {cc:unicode2gname[ord(u)] for cc,u in cc2unicode.items() if ord(u) in unicode2gname} if not cc2gname: - cc2gname = self.info['gid2gname'] + cc2gname = {chr(gid):gname for gid, gname in self.info['gid2gname'].items()} encoding = PdfFontEncoding() @@ -826,26 +875,28 @@ def get_cid_encoding_from_type0_font(self): gid2gname = self.info.get('gid2gname') if not gid2gname: return None - # Get cid2gid - cid2gid = self.get_cidtogidmap() + # Get cid2gid (str -> str) 
+ cid2gid = PdfFontDictFunc.get_cidtogidmap(self.font) if not cid2gid: cid2gid = {cc:cc for cc in self.get_cc2width_from_font()} # Append cids from CIDSet if it exists - CIDSet = self.get_cidset() + CIDSet = PdfFontDictFunc.get_cidset(self.font) if CIDSet: for cid in CIDSet: if cid not in cid2gid: cid2gid[cid] = cid - # msg(f'{self.get_font_name()}: cc2width --> encoding') + if len(cid2gid) == 0: + name = self.font.BaseFont or self.font.DescendantFonts[0].BaseFont + cid2gid = {chr(gid):chr(gid) for gid in gid2gname} # Create cid2gname - cid2gname = {cid:gid2gname.get(gid,'.notdef') for cid,gid in sorted(cid2gid.items())} + cc2gname = {cid:gid2gname.get(ord(gid),'.notdef') for cid,gid in sorted(cid2gid.items())} # Create encoding encoding = PdfFontEncoding() encoding.name = PdfName('CIDEncoding') - encoding.cc2glyphname = {cc:PdfName(gname) for cc,gname in cid2gname.items()} + encoding.cc2glyphname = {cc:PdfName(gname) for cc,gname in cc2gname.items()} encoding.reset_glyphname2cc() return encoding @@ -865,11 +916,14 @@ def get_cid2unicode(self, encoding:PdfFontEncoding = None): # -------------------------------------------------------------------------------- get_cid2width() - def get_cid2width(self, encoding:PdfFontEncoding = None): + def get_cid2width_from_info(self, encoding:PdfFontEncoding = None): ''' + Get a map from CIDs to widths (int -> float) from self.info ''' - cid2gname = {cid:gname[1:] for cid,gname in encoding.cc2glyphname.items()} if encoding \ - else {gid:gid for gid in self.info['gid2gname'].items()} + if encoding: + cid2gname = {ord(cc):gname[1:] for cc,gname in encoding.cc2glyphname.items()} + else: + cid2gname = self.info['gid2gname'] gname2width = self.info['gname2width'] return {cid:gname2width.get(gname,0) for cid,gname in cid2gname.items()} @@ -877,9 +931,9 @@ def get_cid2width(self, encoding:PdfFontEncoding = None): def make_font_dict(self, encoding:PdfFontEncoding = None, force_CID:bool = False): - #
................................................................................ get_flags() + # ................................................................................ make_flags() - def get_flags(info:dict): + def make_flags(info:dict): ''' Calculates the `Flags` bit field. ''' @@ -957,7 +1011,7 @@ def get_flags(info:dict): elif self.ttf: # Make FontFile2 - fontProgram = PdfFont.fix_ttf_font(self.ttf) + fontProgram = PdfFontFile.fix_ttf_font(self.ttf) FontFile2 = IndirectPdfDict( Length1 = len(fontProgram), @@ -984,7 +1038,7 @@ def get_flags(info:dict): FontDescriptor = IndirectPdfDict( Type = PdfName('FontDescriptor'), FontName = PdfName(FontName), - Flags = get_flags(self.info), + Flags = make_flags(self.info), # FontBBox = PdfArray(self.info['FontBBox']), FontBBox = PdfArray(bbox), ItalicAngle = self.info['ItalicAngle'], @@ -1019,14 +1073,14 @@ def get_flags(info:dict): CIDToGIDMap = None else: gname2gid = {gname:gid for gid,gname in self.info['gid2gname'].items()} - cid2gid = {cid:gname2gid.get(gname[1:],0) for cid,gname in encoding.cc2glyphname.items()} - gids = [ord(cid2gid.get(chr(cid),chr(0))) for cid in range(maxCID + 1)] + cid2gid = {ord(cc):gname2gid.get(gname[1:],0) for cc,gname in encoding.cc2glyphname.items()} + gids = [cid2gid.get(cid,0) for cid in range(maxCID + 1)] CIDToGIDMap = IndirectPdfDict( stream=py23_diffs.convert_load(b''.join(bytes([gid >> 8, gid & 255]) for gid in gids)) ) # Widths - W = PdfArray([x for cid,w in self.get_cid2width(encoding).items() for x in [ord(cid),PdfArray([w])]]) + W = PdfArray([x for cid,w in self.get_cid2width_from_info(encoding).items() for x in [cid,PdfArray([w])]]) # CIDFontSubtype; see PDF Ref. Sec. 
5.8, Table 5.23 Embedded font organization for various font types CIDFontSubtype = PdfName('CIDFontType2') if otfWithGlyf or self.ttf else PdfName('CIDFontType0') @@ -1071,7 +1125,7 @@ def get_flags(info:dict): Differences, FirstChar, LastChar = PdfFontEncoding.cc2glyphname_to_differences(encoding.cc2glyphname) - Widths = PdfArray([self.get_cid2width(encoding).get(chr(i),0) for i in range(FirstChar, LastChar+1)]) + Widths = PdfArray([self.get_cid2width_from_info(encoding).get(i,0) for i in range(FirstChar, LastChar+1)]) fontDict = IndirectPdfDict( Type = PdfName('Font'), @@ -1134,18 +1188,46 @@ def extract_font_program(self): self.otf = FontProgram else: raise ValueError(f'invalid FontFile3.Subtype: {subtype}') + + + # -------------------------------------------------------------------------------- save() + + def save(self, basePath:str = None): + ''' + The basePath argument is the intended path to the saved font file without the file's extension. + If it's None then the self.name is chosen as the file name, and the file is saved in the current folder. + ''' + fontProgram = self.pfb or self.ttf or self.otf or self.cff + if fontProgram is None: + raise ValueError(f'no font program in font: {self.name}') + if basePath is None: + basePath = self.name[1:] + ext = 'pfb' if self.pfb else 'ttf' if self.ttf else 'otf' if self.otf else 'cff' if self.cff else None + + # if ext == 'cff': + # PdfFontFile.cff_to_xml(self.cff, basePath + '.xml') + # return + + open(basePath + '.'
+ ext, 'wb').write(fontProgram) + return # -------------------------------------------------------------------------------- get_font_name() def get_font_name(self): ''' + This is a wrapper function around `PdfFontDictFunc.get_font_name()` which does some extra work: + + * if `self.font` doesn't have a name, this function will return `f'/T3Font{N}'`, where `N` + is a consecutive integer (note: Type3 font type is the only one that is allowed not to have a name); + * if the name of `self.font` has been encountered in previous calls to this function, it will + return a "versioned" variant of the name `f'{name}-v{N}'`, where `N` is a consecutive integer. + + Altogether, this ensures that names returned for different fonts never coincide. ''' nDuplicates = lambda d, k: d.get(k) or d.update({k:len(d)+1}) or len(d) # duplicates counter f = self.font - result = f.Name if f.Subtype == '/Type3' \ - else f.DescendantFonts[0].BaseFont if f.Subtype == '/Type0' and f.DescendantFonts != None \ - else f.BaseFont + result = PdfFontDictFunc.get_font_name(f) if result not in self.__fontNameCache: self.__fontNameCache[result] = {} idx = nDuplicates(self.__fontNameCache[result], id(f)) @@ -1294,7 +1376,7 @@ def get_cc2width_from_font(self): gid2gname = self.info.get('gid2gname') cc2width = {cc:ww for cc,ww in cc2width.items() \ if gname2width and self.encoding.cc2glyphname.get(cc, '/None')[1:] in gname2width \ - or gid2gname and cc in gid2gname \ + or gid2gname and ord(cc) in gid2gname \ or gid2gname is None and gname2width is None} else: # CID fonts @@ -1328,9 +1410,10 @@ def get_cc2width_from_font(self): for cc in cidset: if cc not in cc2width: cc2width[cc] = defaultWidth - # elif gid2gname := self.info.get('gid2gname'): - # for cc in gid2gname: - # if cc not in cc2width: cc2width[cc] = defaultWidth + if len(cc2width) == 0: + gid2gname = self.info.get('gid2gname') + if gid2gname: + cc2width = {chr(gid):defaultWidth for gid in gid2gname} # Rescale from font units to document units @@ 
-1627,6 +1710,44 @@ def loadFont(self, fontNames:list[str], dirList:list[str], forceCID = False): self.fontsCache[nameFound] = font return font +# ------------------------------------------------------- get_object_fonts() + +def get_object_fonts(xobj:PdfDict, + fontTypes:list[str] = None, + regex:str = None): + ''' + Returns a `{id(font):font}` dictionary of fonts used by `xobj` whose Subtypes match those in the `fontTypes` list. + Setting `fontTypes = None` has the same effect as setting it to: + + `['/Type1', '/MMType1', '/TrueType', '/Type3', '/Type0']`. + + Example: + + `get_object_fonts(pdf, fontTypes = ['/Type3']).` + + If the `regex` argument is provided only the fonts whose names match the regex are selected. Fonts + that have no name are always included. + ''' + cache = set() + + fontName = PdfFontDictFunc.get_font_name + + if fontTypes is None: + fontTypes = ['/Type1', '/MMType1', '/TrueType', '/Type3', '/Type0'] + + fontFilter = lambda obj: \ + isinstance(obj, PdfDict) \ + and obj.Type == PdfName.Font \ + and obj.Subtype in fontTypes \ + and (regex in [None, '.'] or fontName(obj) is not None and re.search(regex, fontName(obj), re.IGNORECASE)) + + if xobj.pages: + objectTuples = [t for page in xobj.pages for t in PdfObjects(page, cache=cache)] + else: + objectTuples = PdfObjects(xobj, cache=cache) + + return {id(obj):obj for name, obj in objectTuples if fontFilter(obj)} + # ============================================================================= main() if __name__ == '__main__': diff --git a/pdffontencoding.py b/pdffontencoding.py index c24a278..6000b33 100755 --- a/pdffontencoding.py +++ b/pdffontencoding.py @@ -28,29 +28,31 @@ def __init__(self, self.baseEncoding = None gid2gname = fontInfo.get('gid2gname') if fontInfo else None + cmap10 = fontInfo.get('cmap10') if fontInfo else None + cmap30 = fontInfo.get('cmap30') if fontInfo else None # self.baseEncoding = '/WinAnsiEncoding' self.cc2glyphname = {} # A map from character codes (chars) to glyph 
names self.glyphname2cc = {} # A reverse map from glyph names to character codes (chars) + cc2g = {} if name != None: self.name = name - self.cc2glyphname = PdfFontEncodingStandards.get_cc2glyphname(name) - if self.cc2glyphname == None: err(f'invalid encoding name: {name}') + cc2g = PdfFontEncodingStandards.get_cc2glyphname(name) + if cc2g == None: err(f'invalid encoding name: {name}') if differences != None: self.name = '[/Differences]' - self.cc2glyphname = PdfFontEncoding.differences_to_cc2glyphname(differences) + cc2g = PdfFontEncoding.differences_to_cc2glyphname(differences) if font != None: assert font.Subtype != '/Type0' # SIMPLE font self.isType3 = font.Subtype == '/Type3' - self.cc2glyphname = {} fd = font.FontDescriptor isEmbedded = fd and (fd.FontFile or fd.FontFile2 or fd.FontFile3) @@ -74,9 +76,9 @@ def __init__(self, else None if self.baseEncoding: - self.cc2glyphname = PdfFontEncodingStandards.get_cc2glyphname(self.baseEncoding) + cc2g = PdfFontEncodingStandards.get_cc2glyphname(self.baseEncoding) elif gid2gname: - self.cc2glyphname = {gid:PdfName(gname) for gid,gname in gid2gname.items()} + cc2g = {chr(gid):PdfName(gname) for gid,gname in gid2gname.items()} # PDF Ref. v1.7 sec. 
5.5.5: # If the Encoding entry is a dictionary, the table is initialized with the entries from the @@ -88,7 +90,7 @@ def __init__(self, if font.Encoding.Differences != None: differencesMap = PdfFontEncoding.differences_to_cc2glyphname(font.Encoding.Differences) - self.cc2glyphname = self.cc2glyphname | differencesMap + cc2g = cc2g | differencesMap self.name = [font.Encoding.BaseEncoding, '/Differences' if font.Encoding.Differences != None else None] @@ -96,22 +98,29 @@ def __init__(self, self.name = font.Encoding if font.Encoding \ else PdfFontCore14.built_in_encoding(font.BaseFont) if not isEmbedded \ - else '/TrueType' if font.Subtype == '/TrueType' \ + else '/Built-In' if font.Subtype == '/TrueType' \ else None - if self.name == '/TrueType': - if cmap30 := fontInfo.get('cmap30'): - self.cc2glyphname = {cc:PdfName(gname) for cc,gname in cmap30.items()} - elif cmap10 := fontInfo.get('cmap10'): - self.cc2glyphname = {cc:PdfName(gname) for cc,gname in cmap10.items()} + if self.name == '/Built-In': + # PDF Ref. 
v1.7 p.432 + if len(fontInfo) == 0: + # In case font program extraction failed or was not requested + pass + elif cmap30: + high = [(ord(cc)>>8) for cc in cmap30] + if not any(all(h==a for h in high) for a in [0x00, 0xf0, 0xf1, 0xf2]): + raise ValueError(f"failed (3,0) cmap range check in a TrueType font: {font.BaseFont}") + cc2g = {chr(ord(cc) & 0xff):PdfName(gname) for cc,gname in cmap30.items()} + elif cmap10: + cc2g = {cc:PdfName(gname) for cc,gname in cmap10.items()} else: - raise ValueError(f'no built-in encoding in a TrueType font: {font.BaseFont}') + raise ValueError(f'no (3,0) or (1,0) cmap in a TrueType font: {font.BaseFont}') elif self.name: - self.cc2glyphname = PdfFontEncodingStandards.get_cc2glyphname(self.name) + cc2g = PdfFontEncodingStandards.get_cc2glyphname(self.name) elif gid2gname: - self.cc2glyphname = {gid:PdfName(gname) for gid,gname in gid2gname.items()} + cc2g = {chr(gid):PdfName(gname) for gid,gname in gid2gname.items()} - # reset self.glyphname2cc + self.cc2glyphname = cc2g self.reset_glyphname2cc() # -------------------------------------------------------------------------------- differences_to_cc2glyphname() diff --git a/pdffontglyphmap.py b/pdffontglyphmap.py index 9e44557..902312b 100755 --- a/pdffontglyphmap.py +++ b/pdffontglyphmap.py @@ -247,7 +247,7 @@ def reencode_cmap(self, cmap:PdfFontCMap, encoding:PdfFontEncoding, direct=False def strip_dot_endings(s:str): '''Strips ._, .sc & .cap endings from a string; useful to match variants (small caps etc) of glyph names in glyph lists ''' - s1 = re.sub(r'(\.(_|sc|cap|alt[0-9]*|disp|big|small|ts1|lf|swash))+$','', s) + s1 = re.sub(r'(\.(_|sc|cap|alt[0-9]*|vsize[0-9]*|hsize[0-9]*|disp|big|small|ts1|lf|tf|swash))+$','', s) s1 = re.sub(r'\\rm', '', s1) return s1 if len(s1)>1 else s diff --git a/pdfimage.py b/pdfimage.py index 8313164..debdf68 100755 --- a/pdfimage.py +++ b/pdfimage.py @@ -166,20 +166,34 @@ def create_profile(cs:CS_TYPE): ctx = lc.cmsCreateContext(None, None) white = 
lc.cmsCIExyY() white.x, white.y, white.Y = x, y, Y + if name == '/Lab': + profile = lc.cmsCreateLab2Profile(white) + if name == '/CalGray': + gamma = float(dic.Gamma) if dic.Gamma != None else 1 transferFunction = lc.cmsBuildGamma(ctx,gamma) profile = lc.cmsCreateGrayProfile(white, transferFunction) + if name == '/CalRGB': + primaries = [float(v) for v in dic.Matrix] if dic.Matrix != None else [1,0,0,0,1,0,0,0,1] - primaries = [primaries[3*i,3*i+3] for i in range(3)] + primaries = [primaries[3*i:3*i+3] for i in range(3)] primaries = [[X/(X+Y+Z),Y/(X+Y+Z),Y] for X,Y,Z in primaries] gamma = [float(v) for v in dic.Gamma] if dic.Gamma != None else [1,1,1] transferFunction = [lc.cmsBuildGamma(ctx, g) for g in gamma] - profile = lc.cmsCreateRGBProfile(white, primaries, transferFunction) + + pt = lc.cmsCIExyYTRIPLE() + pt.Red = lc.cmsCIExyY(); pt.Red.x, pt.Red.y, pt.Red.Y = primaries[0] + pt.Green = lc.cmsCIExyY(); pt.Green.x, pt.Green.y, pt.Green.Y = primaries[1] + pt.Blue = lc.cmsCIExyY(); pt.Blue.x, pt.Blue.y, pt.Blue.Y = primaries[2] + + profile = lc.cmsCreateRGBProfile(white, pt, transferFunction) + with tempfile.TemporaryDirectory() as tmp: + T = lambda fileName: os.path.join(tmp, fileName) lc.cmsSaveProfileToFile(profile, T('profile.cms')) icc_profile = open(T('profile.cms'),'rb').read() @@ -454,10 +468,13 @@ def toBytes(s:str): return s.encode('Latin-1') width, height = int(obj.Width), int(obj.Height) cpp = self.get_cpp() - bpc_implied = (len(stream) * 8) / (width * height * cpp) if len(stream) > 1 else 1 - if bpc_implied != self.bpc and int(bpc_implied) == bpc_implied: - warn(f'replacing bad image xobject\'s bpc = {self.bpc} with the implied bpc = {int(bpc_implied)}') - self.bpc = int(bpc_implied) + # Sometimes /BitsPerComponent is incorrect + bytesPerLine = len(stream) / height + if bytesPerLine == int(bytesPerLine): + bpc_implied = (bytesPerLine * 8) / (width * cpp) if len(stream) > 1 else 1 + if bpc_implied != self.bpc and int(bpc_implied) == bpc_implied: +
warn(f'replacing bad image xobject\'s bpc = {self.bpc} with the implied bpc = {int(bpc_implied)}') + self.bpc = int(bpc_implied) array = PdfFilter.unpack_pixels(stream, width, cpp, self.bpc, truncate = True) if array.shape[0] > height: @@ -614,8 +631,9 @@ def render(self, pdfPage:PdfDict = None, debug:bool = False): # Default colorspace try: cs2cs = {'/DeviceGray':'/DefaultGray', '/DeviceRGB':'/DefaultRGB', '/DeviceCMYK':'/DefaultCMYK'} - cs = pdfPage.Resources.ColorSpace[cs2cs[self.ColorSpace]] - if debug: msg(f'Page default colorspace: {PdfColorSpace.toStr(self.ColorSpace)} --> {PdfColorSpace.toStr(cs)}') + cs = pdfPage.Resources.ColorSpace[cs2cs[self.ColorSpace]] or self.ColorSpace + if cs != None and debug: + msg(f'Page default colorspace: {PdfColorSpace.toStr(self.ColorSpace)} --> {PdfColorSpace.toStr(cs)}') except: cs = self.ColorSpace @@ -1067,6 +1085,10 @@ def print_size_change(size_old, size_new): elif Format == 'JPEG2000': + if image.get_mode() not in ['L', 'RGB']: + msg(f'mode {image.get_mode()} not supported by JPEG2000; skipping') + return + assert Q or CR if Q: assert 0 < Q <= 100 if CR: assert CR > 1 @@ -1388,15 +1410,28 @@ def _jp2_write(array:np.ndarray, alpha:np.ndarray, cs:CS_TYPE, Q:int, CR:float): ''' Encode array as a JPEG2000 image. 
''' - assert CR or Q - CR = int(round(CR)) if CR != None else int(round(2 ** ((100 - Q)/10.0))) + if alpha is not None: + raise ValueError(f'JPEG2000 encoding with alpha-channel is not implemented') + + # make sure colorspace is either grayscale or RGB cpp = PdfColorSpace.get_cpp(cs) if cpp not in [1,3]: raise ValueError(f'Jp2k: cpp = {cpp} not supported (must be 1 or 3)') + + # determine compression ratio + assert CR or Q + CR = int(round(CR)) if CR != None else int(round(2 ** ((100 - Q)/10.0))) + + # limit the number of resolutions for small images + numres = 6 # the default for large images + while 1 << numres > min(array.shape[:2]): + numres -= 1 + + # encode with tempfile.TemporaryDirectory() as tmp: T = lambda fileName: os.path.join(tmp, fileName) # can also try: cratios=[CR*4, CR*2, CR] - Jp2k(T('encoded.jp2'), array, colorspace='RGB' if cpp == 3 else 'Gray', cratios=[CR]) + Jp2k(T('encoded.jp2'), array, colorspace='RGB' if cpp == 3 else 'Gray', cratios=[CR], numres=numres) return open(T('encoded.jp2'), 'rb').read() @@ -1756,7 +1791,7 @@ def modify_image_xobject(image_obj:IndirectPdfDict, pdfPage:PdfDict, options:Pdf modified = True if options.colorspace: - image.render(pdfPage = pdfPage) + image.render(pdfPage = pdfPage, debug = options.debug) mode = {'cmyk':'CMYK', 'rgb':'RGB', 'gray':'L', 'grey':'L'}.get(options.colorspace.lower()) msg(f'converting colorspace: {image.get_mode()} --> {mode}') modified = image.change_mode(mode, intent) @@ -1822,6 +1857,7 @@ def getPageRange(s:str): ap.add_argument('-output', '-o', type=str, metavar='PATH', help='output PDF file path') ap.add_argument('-pages', type=str, metavar='RANGE', help='process selected pages; RANGE = N1[,N2-N3[,..]]') ap.add_argument('-dpi', type=float, metavar='N', help='set resolution of input images to DPI') + ap.add_argument('-debug', action='store_true', help='turns debugging on') ap.add_argument('-bitonal', action='store_true', help='convert color/gray images to bitonal using Otsu\'s 
algorithm') ap.add_argument('-auto', action='store_true', help='detect if gray/color images are in fact bitonal and convert them') diff --git a/pdfstreameditor.py b/pdfstreameditor.py index 23ad31c..f6314ed 100644 --- a/pdfstreameditor.py +++ b/pdfstreameditor.py @@ -35,7 +35,9 @@ class PdfStreamEditor: - def __init__(self, xobj:PdfDict, glyphMap:PdfFontGlyphMap, + def __init__(self, + xobj:PdfDict, + glyphMap:PdfFontGlyphMap = PdfFontGlyphMap(), textOnly:bool=False, graphicsOnly:bool=False, normalize=False, @@ -87,8 +89,8 @@ def normalize_text_operators(self, tree:list, state:PdfState = None) -> list: ''' if state == None: state = PdfState(self.xobj.inheritable.Resources, self.glyphMap, - extractFontProgram=self.extractFontProgram, - makeSyntheticCmap=self.makeSyntheticCmap) + extractFontProgram=False, + makeSyntheticCmap=False) result = [] for leaf in tree: cmd, args = leaf[0], leaf[1] @@ -216,8 +218,14 @@ def recurse(self, recursedFunction:Callable, xobjCache, *args, **kwarg): for x in xobjects: if id(x) not in xobjCache and x.Subtype == PdfName.Form and x.stream != None: # Creates an editor of the same type as that of any inheriting class - editor = type(self)(x, self.glyphMap, textOnly = self.textOnly, graphicsOnly=self.graphicsOnly, - normalize = self.normalize, debug = self.debug) + editor = type(self)(xobj = x, + glyphMap = self.glyphMap, + textOnly = self.textOnly, + graphicsOnly = self.graphicsOnly, + normalize = self.normalize, + debug = self.debug, + extractFontProgram = self.extractFontProgram, + makeSyntheticCmap = self.makeSyntheticCmap) xobjCache[id(x)] = editor.recurse(recursedFunction, xobjCache, *args, **kwarg) if editor.isModified: self.isModified = True @@ -242,8 +250,8 @@ def flattenImages(self, dpi:float = 300): # This state is local to this function state = PdfState(resources = self.xobj.inheritable.Resources, glyphMap = self.glyphMap, - extractFontProgram = self.extractFontProgram, - makeSyntheticCmap = self.makeSyntheticCmap) + 
extractFontProgram = False, + makeSyntheticCmap = False) res = self.xobj.inheritable.Resources @@ -493,6 +501,8 @@ def processTextFunction(self, xobjCache:dict, tree:list=None, options:dict = {}) btText = PdfStreamEditor.chunks_to_text(btChunks) discardText = (regex != '' and re.search(regex,btText) != None) + + discardOCR = False if not removeOCR else \ any(len(kid[1])>0 and (kid[0],kid[1][0]) == ('Tr','3') for kid in leaf[2])