Skip to content

Commit

Permalink
Bug fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
sl2c committed Nov 1, 2023
1 parent 5037291 commit 098ab5b
Show file tree
Hide file tree
Showing 9 changed files with 34 additions and 21 deletions.
10 changes: 7 additions & 3 deletions common.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,22 @@ def err(msg):
the_func = stack[1][0].f_code.co_name
# the_func = inspect.getouterframes(inspect.currentframe(), 2)[1][3]
lineno = inspect.getouterframes(inspect.currentframe(), 2)[1][2]
eprint(f'Error in {the_class}.{the_func}(), line {lineno}: {msg}')
eprint(f'{the_class}.{the_func}(): error in line {lineno}: {msg}')
sys.exit(1)

def msg(msg):
'''Prints a message in the form: 'func(): msg', where func() is the function that called msg().'''
stack = inspect.stack()
the_class = stack[1][0].f_locals["self"].__class__.__name__ if "self" in stack[1][0].f_locals else 'global'
callerName = inspect.getouterframes(inspect.currentframe(), 2)[1][3]
eprint(f'{callerName}(): {msg}')
eprint(f'{the_class}.{callerName}(): {msg}')

def warn(msg):
'''Prints a warning message in the form: 'func(): warning: msg', where func() is the function that called warn().'''
stack = inspect.stack()
the_class = stack[1][0].f_locals["self"].__class__.__name__ if "self" in stack[1][0].f_locals else 'global'
callerName = inspect.getouterframes(inspect.currentframe(), 2)[1][3]
eprint(f'{callerName}(): warning: {msg}')
eprint(f'{the_class}.{callerName}(): warning: {msg}')

def er(msg):
'''Prints a message in the form 'Error: msg' to stderr, then exits.'''
Expand Down
2 changes: 1 addition & 1 deletion pdffilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def uncompress(obj:IndirectPdfDict):
elif filter in ['/FlateDecode', '/Fl', '/LZWDecode', '/LZW']:

# PDF Ref. 1.7 Sec. 3.3.3
if filter == '/FlateDecode':
if filter in ['/FlateDecode','/Fl']:
stream = zlib.decompress(stream)
else:
earlyChange = int(get_key(parm, '/EarlyChange', '1'))
Expand Down
10 changes: 5 additions & 5 deletions pdffont.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,10 +304,10 @@ def __init__(self, fontDict:PdfDict, glyphMap:PdfFontGlyphMap = PdfFontGlyphMap(
except: self.bbox = [0, 0, 1000, 1000]

# Set cmap
try: # /ToUnicode may be junk
self.cmap = PdfFontCMap(toUnicodeDict = self.font.ToUnicode) if self.font.ToUnicode != None else None
except:
self.cmap = None
if self.font.ToUnicode != None:
try: self.cmap = PdfFontCMap(toUnicodeDict = self.font.ToUnicode) # /ToUnicode may be junk
except: self.cmap = None
else: self.cmap = None

if self.cmap == None:
if self.is_cid():
Expand Down Expand Up @@ -413,7 +413,7 @@ def make_cc2width(self):
font = self.font
if font == None: return None

if font.Subtype in ['/Type1','/Type3','/TrueType']:
if font.Subtype in ['/Type1', '/MMType1', '/Type3', '/TrueType']:

# Set cc2width
if font.Widths == None:
Expand Down
3 changes: 2 additions & 1 deletion pdffontcmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,11 @@ def read_to_unicode_dict(self, ToUnicode:IndirectPdfDict):
if stream == None: warn(f'no stream in font\'s ToUnicode object: {ToUnicode}') ; self.set_to_identity_map() ; return

if ToUnicode.Filter != None:
# stream = PdfFilter.uncompress(ToUnicode).stream
try:
stream = PdfFilter.uncompress(ToUnicode).stream
except:
warn(f'failed to decompress ToUnicode CMap: {ToUnicode}')
warn(f"failed to decompress the font's ToUnicode CMap: {ToUnicode}")
return
self.cc2unicode = {}

Expand Down
4 changes: 2 additions & 2 deletions pdffontglyphmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,15 +71,15 @@ def __init__(self, glyphListPaths:list[str] = [], fonts:PdfObjects = {}, knownPr

def composite_glyphname_to_unicode(self, gname:str):
''' For a glyph name of the composite 'prefix + number' form, e.g. 'c13', 'glyph10H', etc.,
where prefix is of the form: '[a-zA-Z]|#|FLW|uni|glyph|MT|.*\.g' and suffix is a DEX (decimal/hex)
where prefix is of the form: '[a-zA-Z]|#|FLW|uni|Char|glyph|MT|.*\.g' and suffix is a DEX (decimal/hex)
number: '[0-9a-fA-F]+' returns the number part as int by interpreting the corresponding string
part as a hex or dec number based on the statistics of previous encounters with the glyph names
of this form. The usage scenario is to first run this function through all available glyph names
to train the algorithm, and then call it on any particular glyph name to get results.
'''
suffix_type = lambda suffix: self.DEX if all(c in string.digits for c in suffix) else self.HEX

gname_marked = re.sub(r'^([a-zA-Z]|#|FLW|uni|glyph|MT|.*\.g)([0-9a-fA-F]+)$',r'\1|||\2',gname)
gname_marked = re.sub(r'^([a-zA-Z]|#|FLW|uni|Char|glyph|MT|.*\.g)([0-9a-fA-F]+)$',r'\1|||\2',gname)
gname_split = re.split(r'\|\|\|',gname_marked)
prefix,suffix = gname_split if len(gname_split) == 2 else (None,None)
if prefix == None: return None
Expand Down
13 changes: 8 additions & 5 deletions pdfimage.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,7 @@ def decode(obj:IndirectPdfDict, pdfPage:PdfDict = None, adjustColors = True, app

width, height = int(obj.Width), int(obj.Height)
bpc, cs = PdfImage.get_image_specs(obj)
# msg(f'image specs: {bpc}, {cs}')
msg(f'image specs: {bpc}, {cs}')

img = None

Expand Down Expand Up @@ -376,7 +376,9 @@ def decode(obj:IndirectPdfDict, pdfPage:PdfDict = None, adjustColors = True, app
elif filter == '/JPXDecode': # --> JPEG 2000

# msg('/JPXDecode --> JPEG2000')
img = Image.open(BytesIO(stream))
warn(f'/JPXDecode implementation is buggy at the moment, decoding is not attempted')
# img = Image.open(BytesIO(stream))
return None

else:
warn(f'unsupported stream filter: {filter}')
Expand Down Expand Up @@ -545,7 +547,7 @@ def decode(obj:IndirectPdfDict, pdfPage:PdfDict = None, adjustColors = True, app
# Restore icc_profile
img.info['icc_profile'] = icc_profile

if img.mode != 'RGB':
if img.mode != 'RGB':
intent = obj.Intent if applyIntent else None
# msg(f"Converting {img.mode} to sRGB with rendering intent: {intent}")
img = ImageUtils.pil_image_to_srgb(img, intent)
Expand Down Expand Up @@ -573,8 +575,9 @@ def get_image_specs(obj:PdfDict):

bpc = int(obj.BitsPerComponent)

if bpc == 1:
return 1, PdfColorSpace(mode = '1', cpp = 1)
# This is wrong if the CS is /Indexed (yes, there are indexed bitonal images)
# if bpc == 1:
# return 1, PdfColorSpace(mode = '1', cpp = 1)

return bpc, PdfColorSpace(obj.ColorSpace)

Expand Down
1 change: 1 addition & 0 deletions pdfstate.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ def update(self, cmd, args):
# don't interpret the last displacement as space since it may be followed by a negative displacement
# in TD etc; instead, just ignore it, but include it in the width calculation (next line)
# this way it will contribute to cs.Tm and will be interpreted as space, if necessary, later
# NB: the 0.25 factor is totally ad hoc and needs to be tested! See also same issue in self._get_gap()
zTight = z if len(z) == 0 or isinstance(z[-1],str) else z[:-1]
textString = ''.join(cs.font.decodeCodeString(t) if isinstance(t,str)
else f' ' if t > cs.font.spaceWidth * cs.fontSize * .667 *.25 else f''
Expand Down
10 changes: 7 additions & 3 deletions pdfstreameditor.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ def __init__(self, xobj:PdfDict, glyphMap:PdfFontGlyphMap,
'''
self.xobj = xobj
self.glyphMap = glyphMap
self.textOnly = textOnly
self.graphicsOnly = graphicsOnly
self.normalize = normalize
self.debug = debug

# Parse the stream tree
Expand Down Expand Up @@ -153,7 +156,7 @@ def recurse(self, recursedFunction:Callable, xobjCache, *args, **kwarg):
The recurse() function tries to solve all of these problems at once: it recurses, as its name
implies, through the graph of xobjects, parses each xobject's stream, calls the recursedFunction
on the parsed stream tree of each xobject, and stores the result of this call in the xobjCache.
on the parsed stream tree of each item in self.XObject, and stores the result of this call in the xobjCache.
By checking the stored results, the recurse() function makes sure it visits each xobject just once.
At the very last, it calls the recursedFunction() on self and returns the result.
Expand All @@ -164,7 +167,8 @@ def recurse(self, recursedFunction:Callable, xobjCache, *args, **kwarg):
for x in xobjects:
if id(x) not in xobjCache and x.Subtype == PdfName.Form and x.stream != None:
# Creates an editor of the same type as that of any inheriting class
editor = type(self)(x, self.glyphMap)
editor = type(self)(x, self.glyphMap, textOnly = self.textOnly, graphicsOnly=self.graphicsOnly,
normalize = self.normalize, debug = self.debug)
xobjCache[id(x)] = editor.recurse(recursedFunction, xobjCache, *args, **kwarg)

return recursedFunction(self, xobjCache, *args, **kwarg)
Expand Down Expand Up @@ -245,7 +249,7 @@ def processTextFunction(self, xobjCache:dict, tree:list=None, state:PdfState = N
# textString = re.sub(r'\n','[newline]',textString)
outText += f"Text: {[cmdText]}\n"
outText += f"BBox: {cmdBBox}\n"
if cs.font != None: outText += f"SpaceWidth: {cs.font.spaceWidth}\n"
if cs.font != None: outText += f"SpaceWidth: {cs.font.spaceWidth}\nEncoding: {cs.font.encoding.cc2glyphname}\n"

if cmd == 'Tf':
outText += f'SetFont: {[cs.font.name, cs.fontSize]}\n'
Expand Down
2 changes: 1 addition & 1 deletion pdfvectorimage.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def vectorize(obj:PdfDict, alpha = 1, upsample=False, smooth=0, vectorize=True):

mask = np.array(image)
mask = np.flipud(mask) # flip upside down since images use inverted coordinate system
mask = np.invert(mask) # black is 1 for masks
# mask = np.invert(mask) # black is 1 for masks

h,w = mask.shape

Expand Down

0 comments on commit 098ab5b

Please sign in to comment.