Bug fixes

sl2c · Nov 1, 2023 · 098ab5b · 098ab5b
1 parent 5037291
commit 098ab5b
Show file tree

Hide file tree

Showing 9 changed files with 34 additions and 21 deletions.
diff --git a/common.py b/common.py
@@ -18,18 +18,22 @@ def err(msg):
     the_func = stack[1][0].f_code.co_name
     # the_func = inspect.getouterframes(inspect.currentframe(), 2)[1][3]
     lineno = inspect.getouterframes(inspect.currentframe(), 2)[1][2]
-    eprint(f'Error in {the_class}.{the_func}(), line {lineno}: {msg}')
+    eprint(f'{the_class}.{the_func}(): error in line {lineno}: {msg}')
     sys.exit(1)
 
 def msg(msg):
     '''Prints a message in the form: 'class.func(): msg', where func() is the function that called msg() and class is the caller's class name ('global' if the caller has no self).'''
+    stack = inspect.stack()
+    the_class = stack[1][0].f_locals["self"].__class__.__name__ if "self" in stack[1][0].f_locals else 'global'
     callerName = inspect.getouterframes(inspect.currentframe(), 2)[1][3]
-    eprint(f'{callerName}(): {msg}')
+    eprint(f'{the_class}.{callerName}(): {msg}')
 
 def warn(msg):
     '''Prints a warning message in the form: 'class.func(): warning: msg', where func() is the function that called warn() and class is the caller's class name ('global' if the caller has no self).'''
+    stack = inspect.stack()
+    the_class = stack[1][0].f_locals["self"].__class__.__name__ if "self" in stack[1][0].f_locals else 'global'
     callerName = inspect.getouterframes(inspect.currentframe(), 2)[1][3]
-    eprint(f'{callerName}(): warning: {msg}')
+    eprint(f'{the_class}.{callerName}(): warning: {msg}')
 
 def er(msg):
     '''Prints a message in the form 'Error: msg' to stderr, then exits.'''

diff --git a/pdffilter.py b/pdffilter.py
@@ -75,7 +75,7 @@ def uncompress(obj:IndirectPdfDict):
             elif filter in ['/FlateDecode', '/Fl', '/LZWDecode', '/LZW']:
 
                 # PDF Ref. 1.7 Sec. 3.3.3
-                if filter == '/FlateDecode':
+                if filter in ['/FlateDecode','/Fl']:
                     stream = zlib.decompress(stream)
                 else:
                     earlyChange = int(get_key(parm, '/EarlyChange', '1'))

diff --git a/pdffont.py b/pdffont.py
@@ -304,10 +304,10 @@ def __init__(self, fontDict:PdfDict, glyphMap:PdfFontGlyphMap = PdfFontGlyphMap(
             except: self.bbox = [0, 0, 1000, 1000]
 
         # Set cmap
-        try: # /ToUnicode may be junk
-            self.cmap = PdfFontCMap(toUnicodeDict = self.font.ToUnicode) if self.font.ToUnicode != None else None
-        except:
-            self.cmap = None
+        if self.font.ToUnicode != None:
+            try: self.cmap = PdfFontCMap(toUnicodeDict = self.font.ToUnicode) # /ToUnicode may be junk
+            except: self.cmap = None
+        else: self.cmap = None
 
         if self.cmap == None:
             if self.is_cid():
@@ -413,7 +413,7 @@ def make_cc2width(self):
         font = self.font
         if font == None: return None
 
-        if font.Subtype in ['/Type1','/Type3','/TrueType']:
+        if font.Subtype in ['/Type1', '/MMType1', '/Type3', '/TrueType']:
 
             # Set cc2width
             if font.Widths == None:

diff --git a/pdffontcmap.py b/pdffontcmap.py
@@ -49,10 +49,11 @@ def read_to_unicode_dict(self, ToUnicode:IndirectPdfDict):
         if stream == None: warn(f'no stream in font\'s ToUnicode object: {ToUnicode}') ; self.set_to_identity_map() ; return
 
         if ToUnicode.Filter != None:
+            # stream = PdfFilter.uncompress(ToUnicode).stream
             try:
                 stream = PdfFilter.uncompress(ToUnicode).stream
             except:
-                warn(f'failed to decompress ToUnicode CMap: {ToUnicode}')
+                warn(f"failed to decompress the font's ToUnicode CMap: {ToUnicode}")
                 return
         self.cc2unicode = {}
 

diff --git a/pdffontglyphmap.py b/pdffontglyphmap.py
@@ -71,15 +71,15 @@ def __init__(self, glyphListPaths:list[str] = [], fonts:PdfObjects = {}, knownPr
 
     def composite_glyphname_to_unicode(self, gname:str):
         ''' For a glyph name of the composite 'prefix + number' form, e.g. 'c13', 'glyph10H', etc.,
-        where prefix is of the form: '[a-zA-Z]|#|FLW|uni|glyph|MT|.*\.g' and suffix is a DEX (decimal/hex)
+        where prefix is of the form: '[a-zA-Z]|#|FLW|uni|Char|glyph|MT|.*\.g' and suffix is a DEX (decimal/hex)
         number: '[0-9a-fA-F]+' returns the number part as int by interpreting the corresponding string
         part as a hex or dec number based on the statistics of previous encounters with the glyph names
         of this form. The usage scenario is to first run this function through all available glyph names
         to train the algorithm, and then call it on any particular glyph name to get results.
         '''
         suffix_type = lambda suffix: self.DEX if all(c in string.digits for c in suffix) else self.HEX
 
-        gname_marked = re.sub(r'^([a-zA-Z]|#|FLW|uni|glyph|MT|.*\.g)([0-9a-fA-F]+)$',r'\1|||\2',gname)
+        gname_marked = re.sub(r'^([a-zA-Z]|#|FLW|uni|Char|glyph|MT|.*\.g)([0-9a-fA-F]+)$',r'\1|||\2',gname)
         gname_split = re.split(r'\|\|\|',gname_marked)
         prefix,suffix = gname_split if len(gname_split) == 2 else (None,None)
         if prefix == None: return None

diff --git a/pdfimage.py b/pdfimage.py
@@ -340,7 +340,7 @@ def decode(obj:IndirectPdfDict, pdfPage:PdfDict = None, adjustColors = True, app
 
         width, height = int(obj.Width), int(obj.Height)
         bpc, cs = PdfImage.get_image_specs(obj)
-        # msg(f'image specs: {bpc}, {cs}')
+        msg(f'image specs: {bpc}, {cs}')
 
         img = None
 
@@ -376,7 +376,9 @@ def decode(obj:IndirectPdfDict, pdfPage:PdfDict = None, adjustColors = True, app
         elif filter == '/JPXDecode': # --> JPEG 2000
 
             # msg('/JPXDecode --> JPEG2000')
-            img = Image.open(BytesIO(stream))
+            warn(f'/JPXDecode implementation is buggy at the moment, decoding is not attempted')
+            # img = Image.open(BytesIO(stream))
+            return None
 
         else:
             warn(f'unsupported stream filter: {filter}')
@@ -545,7 +547,7 @@ def decode(obj:IndirectPdfDict, pdfPage:PdfDict = None, adjustColors = True, app
                 # Restore icc_profile
                 img.info['icc_profile'] = icc_profile
 
-            if img.mode != 'RGB': 
+            if img.mode != 'RGB':
                 intent = obj.Intent if applyIntent else None
                 # msg(f"Converting {img.mode} to sRGB with rendering intent: {intent}")
                 img = ImageUtils.pil_image_to_srgb(img, intent)
@@ -573,8 +575,9 @@ def get_image_specs(obj:PdfDict):
 
         bpc = int(obj.BitsPerComponent)
 
-        if bpc == 1:
-            return 1, PdfColorSpace(mode = '1', cpp = 1)        
+        # This is wrong if the CS is /Indexed (yes, there are indexed bitonal images)
+        # if bpc == 1:
+        #     return 1, PdfColorSpace(mode = '1', cpp = 1)        
 
         return bpc, PdfColorSpace(obj.ColorSpace)
 

diff --git a/pdfstate.py b/pdfstate.py
@@ -136,6 +136,7 @@ def update(self, cmd, args):
             # don't interpret the last displacement as space since it may be followed by a negative displacement
             # in TD etc; instead, just ignore it, but include it in the width calculation (next line)
             # this way it will contribute to cs.Tm and will be interpreted as space, if necessary, later
+            # NB: the 0.25 factor is totally ad hoc and needs to be tested! See also same issue in self._get_gap()
             zTight = z if len(z) == 0 or isinstance(z[-1],str) else z[:-1]
             textString = ''.join(cs.font.decodeCodeString(t) if isinstance(t,str)
                                     else f' ' if t > cs.font.spaceWidth * cs.fontSize * .667 *.25 else f''

diff --git a/pdfstreameditor.py b/pdfstreameditor.py
@@ -42,6 +42,9 @@ def __init__(self, xobj:PdfDict, glyphMap:PdfFontGlyphMap,
         '''
         self.xobj = xobj
         self.glyphMap = glyphMap
+        self.textOnly = textOnly
+        self.graphicsOnly = graphicsOnly
+        self.normalize = normalize
         self.debug = debug
 
         # Parse the stream tree
@@ -153,7 +156,7 @@ def recurse(self, recursedFunction:Callable, xobjCache, *args, **kwarg):
 
         The recurse() function tries to solve all of these problems at once: it recurses, as its name
         implies, through the graph of xobjects, parses each xobject's stream, calls the recursedFunction
-        on the parsed stream tree of each xobject, and stores the result of this call in the xobjCache.
+        on the parsed stream tree of each item in self.XObject, and stores the result of this call in the xobjCache.
         By checking the stored results, the recurse() function makes sure it visits each xobject just once.
         At the very last, it calls the recursedFunction() on self and returns the result.
 
@@ -164,7 +167,8 @@ def recurse(self, recursedFunction:Callable, xobjCache, *args, **kwarg):
         for x in xobjects:
             if id(x) not in xobjCache and x.Subtype == PdfName.Form and x.stream != None:
                 # Creates an editor of the same type as that of any inheriting class
-                editor = type(self)(x, self.glyphMap)
+                editor = type(self)(x, self.glyphMap, textOnly = self.textOnly, graphicsOnly=self.graphicsOnly,
+                                    normalize = self.normalize, debug = self.debug)
                 xobjCache[id(x)] = editor.recurse(recursedFunction, xobjCache, *args, **kwarg)
 
         return recursedFunction(self, xobjCache, *args, **kwarg)
@@ -245,7 +249,7 @@ def processTextFunction(self, xobjCache:dict, tree:list=None, state:PdfState = N
                     # textString = re.sub(r'\n','[newline]',textString)
                     outText += f"Text: {[cmdText]}\n"
                     outText += f"BBox: {cmdBBox}\n"
-                    if cs.font != None: outText += f"SpaceWidth: {cs.font.spaceWidth}\n"
+                    if cs.font != None: outText += f"SpaceWidth: {cs.font.spaceWidth}\nEncoding: {cs.font.encoding.cc2glyphname}\n"
 
                 if cmd == 'Tf':
                     outText += f'SetFont: {[cs.font.name, cs.fontSize]}\n'

diff --git a/pdfvectorimage.py b/pdfvectorimage.py
@@ -26,7 +26,7 @@ def vectorize(obj:PdfDict, alpha = 1, upsample=False, smooth=0, vectorize=True):
 
         mask = np.array(image)
         mask = np.flipud(mask) # flip upside down since images use inverted coordinate system
-        mask = np.invert(mask) # black is 1 for masks
+        # mask = np.invert(mask) # black is 1 for masks
 
         h,w = mask.shape