Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge page improved and bug fixes. #34

Open
wants to merge 2 commits into
base: trunk
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions pyPdf/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,13 +404,16 @@ def writeToStream(self, stream, encryption_key):
obj = ByteStringObject(bytearr)
obj.writeToStream(stream, None)
else:
stream.write("(")
for c in bytearr:
if not c.isalnum() and c != ' ':
stream.write("\\%03o" % ord(c))
else:
stream.write(c)
stream.write(")")
if bytearr == "/Page" : # correction by Dysmas : otherwise writes (\057Page) instead of /Page, which is valid but not supported by poppler
stream.write(bytearr)
else :
stream.write("(")
for c in bytearr:
if not c.isalnum() and c != ' ':
stream.write("\\%03o" % ord(c))
else:
stream.write(c)
stream.write(")")


class NameObject(str, PdfObject):
Expand Down Expand Up @@ -797,4 +800,3 @@ def decode_pdfdocencoding(byte_array):
continue
assert char not in _pdfDocEncoding_rev
_pdfDocEncoding_rev[char] = i

129 changes: 125 additions & 4 deletions pyPdf/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
A pure-Python PDF library with very minimal capabilities. It was designed to
be able to split and merge PDF files by page, and that's about all it can do.
It may be a solid base for future PDF file work in Python.
version 1.13d
"""
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"
Expand Down Expand Up @@ -1018,6 +1019,7 @@ def createBlankPage(pdf=None, width=None, height=None):
page.__setitem__(NameObject('/Type'), NameObject('/Page'))
page.__setitem__(NameObject('/Parent'), NullObject())
page.__setitem__(NameObject('/Resources'), DictionaryObject())
page.__setitem__(NameObject('/Contents'), ArrayObject([]))
if width is None or height is None:
if pdf is not None and pdf.getNumPages() > 0:
lastpage = pdf.getPage(pdf.getNumPages() - 1)
Expand Down Expand Up @@ -1061,15 +1063,24 @@ def _mergeResources(res1, res2, resource):
page2Res = res2.get(resource, DictionaryObject()).getObject()
renameRes = {}
for key in page2Res.keys():
if newRes.has_key(key) and newRes[key] != page2Res[key]:
newname = NameObject(key + "renamed")
if newRes.has_key(key) and ( newRes[key] != page2Res[key]
or resource == "/XObject" ) :
i = 1
while True :
if newRes.has_key(key + "renamed" + str(i)) :
i = i + 1
else :
newname = NameObject(key + "renamed" + str(i))
break

renameRes[key] = newname
newRes[newname] = page2Res[key]
elif not newRes.has_key(key):
newRes[key] = page2Res.raw_get(key)
return newRes, renameRes
_mergeResources = staticmethod(_mergeResources)


def _contentStreamRename(stream, rename, pdf):
if not rename:
return stream
Expand All @@ -1092,6 +1103,15 @@ def _pushPopGS(contents, pdf):
return stream
_pushPopGS = staticmethod(_pushPopGS)


def _addCode(contents, pdf, code, endCode = ""):
    """Wrap *contents* in a ContentStream and surround it with raw code.

    contents -- content object handed to the ContentStream constructor.
    pdf      -- second argument handed to the ContentStream constructor.
    code     -- stored as the operator of a new first operation.
    endCode  -- stored as the operator of a new last operation
                (defaults to the empty string).

    Both added operations carry an empty operand list. Returns the
    resulting ContentStream.
    """
    wrapped = ContentStream(contents, pdf)
    leading = [[], code]
    trailing = [[], endCode]
    wrapped.operations.insert(0, leading)
    wrapped.operations.append(trailing)
    return wrapped
_addCode = staticmethod(_addCode)

def _addTransformationMatrix(contents, pdf, ctm):
# adds transformation matrix at the beginning of the given
# contents stream.
Expand Down Expand Up @@ -1349,6 +1369,107 @@ def scaleTo(self, width, height):
self.mediaBox.getLowerLeft_x ())
self.scale(sx, sy)


# Variant of the mergePage function.
# Merges the content streams of several pages and code strings into one page.
# Resource references (i.e. fonts) are maintained from all pages.
# The parameter ar_data is an array containing code strings and PageObjects.
# ContentStream is called only if necessary because it calls ParseContentStream
# which is slow. Otherwise the content is directly extracted and added to the code.

def mergePage3(self, ar_data ):
    """Merge several pages and/or raw code strings into this page.

    ar_data -- a single PageObject, or a sequence whose items are either
               PageObjects or content-stream code strings.

    For every PageObject the resource categories listed below are merged
    into this page's /Resources through PageObject._mergeResources, and the
    /ProcSet arrays are united.  When that merge reports renamed resources,
    the page's content is passed through PageObject._contentStreamRename;
    otherwise the content is pulled out with self.extractContent, which
    skips building a ContentStream.  Items that are not PageObjects are
    appended to the generated code as they are.

    The collected code (one trailing newline per item) is then inserted in
    front of this page's existing content via PageObject._addCode, and the
    page's /Contents and /Resources entries are replaced.
    """
    originalResources = self["/Resources"].getObject()
    code_parts = []

    if isinstance(ar_data, PageObject) :
        ar_data = [ar_data]

    for data in ar_data :
        if not isinstance(data, PageObject) :
            # Raw code string: taken verbatim.
            code_parts.append(data)
            continue

        # Merge the resource dictionaries first; this tells us which
        # symbols in this page's content stream need to be renamed.
        pagexResources = data["/Resources"].getObject()

        # Fresh per page: a rename decided for one page must not be
        # applied to the content stream of another page.
        newResources = DictionaryObject()
        rename = {}
        for res in "/ExtGState", "/Font", "/XObject", "/ColorSpace", "/Pattern", "/Shading":
            new, newrename = PageObject._mergeResources(originalResources, pagexResources, res)
            if new:
                newResources[NameObject(res)] = new
                rename.update(newrename)

        # Combine /ProcSet sets *before* folding newResources into
        # originalResources, so the union is not lost for the last page.
        newResources[NameObject("/ProcSet")] = ArrayObject(
            frozenset(originalResources.get("/ProcSet", ArrayObject()).getObject()).union(
                frozenset(pagexResources.get("/ProcSet", ArrayObject()).getObject())
            )
        )

        # Combine /Resources sets.
        originalResources.update(newResources)

        if "/Contents" not in data.keys() :
            # Blank page: it contributes resources only, no code.
            continue

        if len(rename) > 0 :
            pagexContent = data['/Contents'].getObject()
            pagexContent = PageObject._contentStreamRename(pagexContent, rename, self.pdf)
            code_parts.append(pagexContent.getData())
        else :
            code_parts.append(self.extractContent(data["/Contents"]))

    # Every item is terminated by a newline, as before.
    code_s = "".join([part + "\n" for part in code_parts])

    originalContent = self["/Contents"].getObject()
    outputContent = PageObject._addCode(originalContent, self.pdf, code_s)

    self[NameObject('/Contents')] = outputContent
    self[NameObject('/Resources')] = originalResources



def setContent(self, data ):
    """Replace this page's content with a single code string.

    data -- content-stream code; it is stored as the operator of the only
            operation, with an empty operand list.

    The existing /Contents object is wrapped in a ContentStream whose
    operation list is then discarded in favour of *data*.  The page's
    /Resources entry is not touched.
    """
    originalContent = self["/Contents"].getObject()

    stream = ContentStream(originalContent, self.pdf)
    # Throw away the existing operations; data becomes the sole one.
    stream.operations = [[[], data]]

    self[NameObject('/Contents')] = stream



def extractContent(self,data) :
    """Return the code held by a /Contents entry as one string.

    data -- an object exposing getObject(); the resolved object may be an
            ArrayObject of further such objects, in which case the parts
            are extracted recursively and concatenated in order.

    A TextStringObject is returned as a plain string.  Anything else is
    decoded with filters.decodeStreamData; if decoding raises, a message is
    printed and the empty string is returned for that part (best effort).
    """
    pageContent = data.getObject()

    if isinstance(pageContent, ArrayObject) :
        return "".join([self.extractContent(part) for part in pageContent])

    if isinstance(data, TextStringObject) :
        # Concatenation yields a plain string rather than the PDF object.
        return "" + data

    try :
        return filters.decodeStreamData(pageContent)
    except Exception :
        # Deliberately best effort, but no longer a bare except, so
        # KeyboardInterrupt and SystemExit are not swallowed.
        print("le code n'a pas pu etre extrait")
        return ""


##
# Compresses the size of this page by joining all content streams and
# applying a FlateDecode filter.
Expand Down Expand Up @@ -1552,7 +1673,8 @@ def _getData(self):
op.writeToStream(newdata, None)
newdata.write(" ")
newdata.write(operator)
newdata.write("\n")
newdata.write("\n") #Bug corrected by Dysmas 10/2010

return newdata.getvalue()

def _setData(self, value):
Expand Down Expand Up @@ -1868,4 +1990,3 @@ def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encr
# output.addPage(page1)
# output.write(file("test\\merge-test.pdf", "wb"))