Various upgrades

sl2c · Apr 23, 2024 · 6441c62 · 6441c62
1 parent ba31199
commit 6441c62
Show file tree

Hide file tree

Showing 18 changed files with 2,919 additions and 1,212 deletions.
diff --git a/color_profiles/USWebCoatedSWOP.icc b/color_profiles/USWebCoatedSWOP.icc
diff --git a/common.py b/common.py
@@ -2,7 +2,8 @@
 
 import inspect,sys
 
-from pdfrw import PdfArray
+from pdfrw import PdfArray, PdfDict, IndirectPdfDict, PdfObject, PdfName
+from pdfrwx.pdfgeometry import BOX
 
 # ========================================================= MESSAGES
 
@@ -12,7 +13,7 @@ def eprint(*args, **kwargs):
 def err(msg):
     '''Prints an error message in the form: 'Error in class.func(), line: msg',
     where class.func() are the class and the function that called err().
-    Exits by sys.exit(1) aftewords'''
+    Exits by sys.exit(1) afterwards'''
     stack = inspect.stack()
     the_class = stack[1][0].f_locals["self"].__class__.__name__ if "self" in stack[1][0].f_locals else 'global'
     the_func = stack[1][0].f_code.co_name
@@ -36,18 +37,38 @@ def warn(msg):
     eprint(f'{the_class}.{callerName}(): warning: {msg}')
 
 def er(msg):
-    '''Prints a message in the form 'Error: msg' to stderr, the exits.'''
+    '''Prints a message in the form 'Error: msg' to stderr, then exits.'''
     eprint(f'Error: {msg}')
     sys.exit(1)
 
 # ========================================================================== Dictionaries access
 
 def encapsulate(obj):
     '''
-    Make a PdfArray([obj]) out of obj unless obj is already a PdfArray
+    Returns obj itself if obj is a PdfArray, an empty PdfArray() if obj is None, or PdfArray([obj]) otherwise.
     '''
     return obj if isinstance(obj,PdfArray) else PdfArray([obj]) if obj != None else PdfArray()
 
+def decapsulate(array:PdfArray):
+    '''
+    Returns None if len(array) == 0, or array[0] if len(array) == 1, or array otherwise. 
+    '''
+    return None if len(array) == 0 else array[0] if len(array) == 1 else array
+
+def get_box(xobj:PdfDict):
+    '''
+    Returns xobj's box, which defaults to BOX(xobj.CropBox/MediaBox) for pdf pages,
+    to BOX(xobj.BBox) for PDF Form xobjects, and to BOX([0,0,1,1]) for Image xobjects.
+    For all other objects returns None
+    '''
+    # f = lambda array: [float(a) for a in array]
+    cropBox = xobj.inheritable.CropBox
+    if cropBox == None: cropBox = xobj.inheritable.MediaBox
+    return BOX(cropBox) if xobj.Contents != None \
+        else BOX(xobj.BBox) if xobj.Subtype == PdfName.Form \
+        else BOX([0,0,1,1]) if xobj.Subtype == PdfName.Image \
+        else None
+
 # ========================================================================== Dictionaries access
 
 def get_key(dic:dict, key:str, defaultValue = None):
@@ -72,3 +93,57 @@ def chain(a:dict, b:dict):
     Returns dictionary c such that for k in a: c[k] = b[a[k]] or a[k] if a[k] is not in b
     '''
     return {k:b.get(v,v) for k,v in a.items()}
+
+
+# ========================================================================== PdfObjSize()
+
+def pdfObjSize(obj:PdfObject, cache:set = set()):
+    '''
+    Returns (size, overhead), where size is the size of the streams of obj and all other
+    dictionaries that the obj references, and overhead is (the estimate of) the corresponding size
+    of their headers (the dictionaries per se). If cache dict is supplied, it is used to
+    eliminate double-counting of dictionaries and arrays.
+    '''
+
+    # if cache != None and (isinstance(obj,PdfDict) or isinstance(obj,PdfArray)):
+    # if cache != None:
+    if id(obj) in cache: return 0, 0
+    cache.add(id(obj))
+
+    REF_OVERHEAD = 8 # a typical '123 0 R ' reference length
+    DICT_OVERHEAD = 5 # '<<>> '
+    ARRAY_OVERHEAD = 3 # '[] '
+
+    if isinstance(obj, PdfDict):
+        size = int(obj.Length) if obj.Length != None else 0
+        overhead = DICT_OVERHEAD
+
+        # Inherit page items
+        INHERITABLE = [PdfName.Resources, PdfName.Rotate, PdfName.MediaBox, PdfName.CropBox]
+        items = {k:v for k,v in obj.items()}
+        if obj.Type == PdfName.Page:
+            for name in INHERITABLE:
+                if name not in items:
+                    inherited = obj.inheritable[name]
+                    if inherited != None:
+                        items[name] = inherited
+
+        for k,v in items.items():
+            # Do not traverse up the page tree or to other pages
+            if k == PdfName.Parent or isinstance(v,PdfDict) and v.Type == PdfName.Page: s,o = 0,0
+            else: s,o = pdfObjSize(v, cache)
+            if isinstance(v,IndirectPdfDict): o += REF_OVERHEAD
+            size += s; overhead += len(k) + o + 2 # 2 separators
+    elif isinstance(obj,PdfArray):
+        size,overhead = 0,ARRAY_OVERHEAD
+        for v in obj:
+            # Do not traverse to other pages
+            if isinstance(v,PdfDict) and v.Type == PdfName.Page: s,o = 0,0
+            else: s,o = pdfObjSize(v, cache)
+            if isinstance(v,IndirectPdfDict): o += REF_OVERHEAD
+            size += s; overhead += o + 1 # 1 separator
+    else:
+        size,overhead = 0, len(str(obj))
+
+    return size, overhead
+
diff --git a/djvusededitor.py b/djvusededitor.py
@@ -77,6 +77,7 @@ def djvusedPageTreeToPDFStream(self, djvusedPageTree:list, font:PdfFont, baselin
                 if len(textUnicode) == 0: continue
                 pdfString = font.encodePdfTextString(textUnicode)
                 stringWidth = font.width(textUnicode)
+                if stringWidth == 0: continue
 
                 scale_x = int(round(font.scaleFactor*(float(xmax)-float(xmin))/stringWidth))
 
@@ -124,7 +125,6 @@ def insert_ocr(self, pdf:PdfReader, defaultUnicodeFont:str, defaultFontDir:str,
             pdfPage = pdf.pages[pageNo-1]
 
             # Remove old OCR
-            stream = ''.join(PdfFilter.uncompress(c).stream for c in encapsulate(pdfPage.contents))
             resources = pdfPage.inheritable.Resources
             if resources == None: resources = PdfDict(); pdfPage.Resources = resources
             pdfEditor = PdfStreamEditor(pdfPage, PdfFontGlyphMap())

diff --git a/pdfbezier.py b/pdfbezier.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 
+from math import sqrt
 
 class PdfBezier:
 
@@ -50,23 +51,24 @@ def enlarge_box(self, box:list, coords:list):
 
 
     def decimate_bezier(self, tree:list):
-        '''Merge pairs of Bezier segments into single Bezier segments whenever this does not introduce noticeable artifacts.
+        '''
+        Merge pairs of Bezier segments into single Bezier segments whenever this does not introduce noticeable artifacts.
         '''
         if len(tree) == 0: return []
 
         path_construction_commands = ['m','l','c','v','y','re','h','W','W*']
         path_construction_commands_that_move_cursor = ['m','l','c','v','y']
         path_painting_commands = ['s','S','f','F','f*','B','B*','b','b*','n']
         # path_commands = path_construction_commands + path_painting_commands
-        p = lambda x: f'{round(x*1000000)/1000000:f}'.rstrip('0').rstrip('.')
+        p = lambda x: f'{round(x*1000)/1000:f}'.rstrip('0').rstrip('.')
 
         LINE_PRECISION = 0.1
         BEZIER_PRECISION = 0.1
 
         x,y = None, None    # current starting coordinates
         x0,y0 = None, None  # starting coordinates at the start of the replacement curve
         x1,y1 = None, None  # first control point at the start of the replacement curve (for Bezier)
-        x2,y2 = None, None  # current ending coordinates; coptied to x,y at the end of each loop
+        x2,y2 = None, None  # current ending coordinates; copied to x,y at the end of each loop
 
         out = []
         inside = False