Skip to content

Commit

Permalink
Various upgrades
Browse files Browse the repository at this point in the history
  • Loading branch information
sl2c committed Apr 23, 2024
1 parent ba31199 commit 6441c62
Show file tree
Hide file tree
Showing 18 changed files with 2,919 additions and 1,212 deletions.
Binary file added color_profiles/USWebCoatedSWOP.icc
Binary file not shown.
83 changes: 79 additions & 4 deletions common.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

import inspect,sys

from pdfrw import PdfArray
from pdfrw import PdfArray, PdfDict, IndirectPdfDict, PdfObject, PdfName
from pdfrwx.pdfgeometry import BOX

# ========================================================= MESSAGES

Expand All @@ -12,7 +13,7 @@ def eprint(*args, **kwargs):
def err(msg):
'''Prints an error message in the form: 'Error in class.func(), line: msg',
where class.func() are the class and the function that called err().
Exits by sys.exit(1) aftewords'''
Exits by sys.exit(1) afterwords'''
stack = inspect.stack()
the_class = stack[1][0].f_locals["self"].__class__.__name__ if "self" in stack[1][0].f_locals else 'global'
the_func = stack[1][0].f_code.co_name
Expand All @@ -36,18 +37,38 @@ def warn(msg):
eprint(f'{the_class}.{callerName}(): warning: {msg}')

def er(msg):
    '''
    Reports a fatal error: writes 'Error: msg' via eprint(), then terminates
    the program with exit status 1.
    '''
    eprint(f'Error: {msg}')
    # sys.exit(1) is implemented by raising SystemExit(1)
    raise SystemExit(1)

# ========================================================================== PdfArray & box helpers

def encapsulate(obj):
    '''
    Normalizes obj to a PdfArray:

    * a PdfArray is returned as is (no copy is made);
    * None becomes an empty PdfArray();
    * anything else is wrapped as PdfArray([obj]).
    '''
    if isinstance(obj, PdfArray):
        return obj
    if obj != None:
        return PdfArray([obj])
    return PdfArray()

def decapsulate(array:PdfArray):
    '''
    Unwraps an array according to the number of its elements:

    * no elements: returns None;
    * exactly one element: returns that element;
    * two or more elements: returns the array itself, unchanged.
    '''
    count = len(array)
    if count == 0:
        return None
    if count == 1:
        return array[0]
    return array

def get_box(xobj:PdfDict):
    '''
    Returns the BOX that bounds xobj, or None if xobj has no notion of a box:

    * xobj with a Contents entry (a pdf page): BOX of the inheritable CropBox,
      falling back to the inheritable MediaBox when there is no CropBox;
    * xobj.Subtype == /Form: BOX(xobj.BBox);
    * xobj.Subtype == /Image: BOX([0,0,1,1]);
    * anything else: None.
    '''
    # The page box is looked up first, before the kind of xobj is examined
    pageBox = xobj.inheritable.CropBox
    if pageBox == None:
        pageBox = xobj.inheritable.MediaBox

    if xobj.Contents != None:
        return BOX(pageBox)
    if xobj.Subtype == PdfName.Form:
        return BOX(xobj.BBox)
    if xobj.Subtype == PdfName.Image:
        return BOX([0,0,1,1])
    return None

# ========================================================================== Dictionaries access

def get_key(dic:dict, key:str, defaultValue = None):
Expand All @@ -72,3 +93,57 @@ def chain(a:dict, b:dict):
Returns dictionary c such that for k in a: c[k] = b[a[k]] or a[k] if a[k] is not in b
'''
return {k:b.get(v,v) for k,v in a.items()}


# ========================================================================== pdfObjSize()

def pdfObjSize(obj:PdfObject, cache:set = None):
    '''
    Returns (size, overhead), where size is the size of the streams of obj and all other
    dictionaries that the obj references, and overhead is (the estimate of) the corresponding size
    of their headers (the dictionaries per se).

    The cache argument is a set of id()-s of the objects that have already been counted;
    an object whose id is in the cache contributes (0, 0). The set is updated in place, so
    a caller can pass the same set to several calls to avoid double-counting objects shared
    between them. If cache is None (the default), a fresh empty set is created for this call,
    so that independent calls never influence each other.
    '''

    # A mutable default (cache = set()) would be created once and then shared by all calls,
    # making a repeated call on the same object return (0, 0); create the set per call instead.
    if cache is None: cache = set()

    # NOTE(review): every object is cached by id, not just dictionaries and arrays, so
    # a scalar object referenced from several places is counted once -- confirm this is intended.
    if id(obj) in cache: return 0, 0
    cache.add(id(obj))

    REF_OVERHEAD = 8 # a typical '123 0 R ' reference length
    DICT_OVERHEAD = 5 # '<<>> '
    ARRAY_OVERHEAD = 3 # '[] '

    if isinstance(obj, PdfDict):
        # The stream size is taken from the dictionary's Length entry, if there is one
        size = int(obj.Length) if obj.Length != None else 0
        overhead = DICT_OVERHEAD

        # Inherit page items
        INHERITABLE = [PdfName.Resources, PdfName.Rotate, PdfName.MediaBox, PdfName.CropBox]
        items = dict(obj.items())
        if obj.Type == PdfName.Page:
            for name in INHERITABLE:
                if name not in items:
                    inherited = obj.inheritable[name]
                    if inherited != None:
                        items[name] = inherited

        for k,v in items.items():
            # Do not traverse up the page tree or to other pages
            if k == PdfName.Parent or isinstance(v,PdfDict) and v.Type == PdfName.Page: s,o = 0,0
            else: s,o = pdfObjSize(v, cache)
            if isinstance(v,IndirectPdfDict): o += REF_OVERHEAD
            size += s; overhead += len(k) + o + 2 # 2 separators
    elif isinstance(obj,PdfArray):
        size,overhead = 0,ARRAY_OVERHEAD
        for v in obj:
            # Do not traverse to other pages
            if isinstance(v,PdfDict) and v.Type == PdfName.Page: s,o = 0,0
            else: s,o = pdfObjSize(v, cache)
            if isinstance(v,IndirectPdfDict): o += REF_OVERHEAD
            size += s; overhead += o + 1 # 1 separator
    else:
        # Any other object: the overhead is the length of its string representation
        size,overhead = 0, len(str(obj))

    return size, overhead

2 changes: 1 addition & 1 deletion djvusededitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def djvusedPageTreeToPDFStream(self, djvusedPageTree:list, font:PdfFont, baselin
if len(textUnicode) == 0: continue
pdfString = font.encodePdfTextString(textUnicode)
stringWidth = font.width(textUnicode)
if stringWidth == 0: continue

scale_x = int(round(font.scaleFactor*(float(xmax)-float(xmin))/stringWidth))

Expand Down Expand Up @@ -124,7 +125,6 @@ def insert_ocr(self, pdf:PdfReader, defaultUnicodeFont:str, defaultFontDir:str,
pdfPage = pdf.pages[pageNo-1]

# Remove old OCR
stream = ''.join(PdfFilter.uncompress(c).stream for c in encapsulate(pdfPage.contents))
resources = pdfPage.inheritable.Resources
if resources == None: resources = PdfDict(); pdfPage.Resources = resources
pdfEditor = PdfStreamEditor(pdfPage, PdfFontGlyphMap())
Expand Down
8 changes: 5 additions & 3 deletions pdfbezier.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/usr/bin/env python3

from math import sqrt

class PdfBezier:

Expand Down Expand Up @@ -50,23 +51,24 @@ def enlarge_box(self, box:list, coords:list):


def decimate_bezier(self, tree:list):
'''Merge pairs of Bezier segments into single Bezier segments whenever this does not introduce noticeable artifacts.
'''
Merge pairs of Bezier segments into single Bezier segments whenever this does not introduce noticeable artifacts.
'''
if len(tree) == 0: return []

path_construction_commands = ['m','l','c','v','y','re','h','W','W*']
path_construction_commands_that_move_cursor = ['m','l','c','v','y']
path_painting_commands = ['s','S','f','F','f*','B','B*','b','b*','n']
# path_commands = path_construction_commands + path_painting_commands
p = lambda x: f'{round(x*1000000)/1000000:f}'.rstrip('0').rstrip('.')
p = lambda x: f'{round(x*1000)/1000:f}'.rstrip('0').rstrip('.')

LINE_PRECISION = 0.1
BEZIER_PRECISION = 0.1

x,y = None, None # current starting coordinates
x0,y0 = None, None # starting coordinates at the start of the replacement curve
x1,y1 = None, None # first control point at the start of the replacement curve (for Bezier)
x2,y2 = None, None # current ending coordinates; coptied to x,y at the end of each loop
x2,y2 = None, None # current ending coordinates; copied to x,y at the end of each loop

out = []
inside = False
Expand Down
Loading

0 comments on commit 6441c62

Please sign in to comment.