diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py index 1ecce31c8..8b72f0932 100644 --- a/PyPDF2/filters.py +++ b/PyPDF2/filters.py @@ -1,5 +1,3 @@ -# vim: sw=4:expandtab:foldmethod=marker -# # Copyright (c) 2006, Mathieu Fenniak # All rights reserved. # @@ -40,7 +38,7 @@ from cStringIO import StringIO else: from io import StringIO - import struct +import struct try: import zlib @@ -356,6 +354,10 @@ def decode(data, decodeParms=None): class CCITTFaxDecode(object): def decode(data, decodeParms=None, height=0): if decodeParms: + from PyPDF2.generic import ArrayObject + if isinstance(decodeParms, ArrayObject): + if len(decodeParms) == 1: + decodeParms = decodeParms[0] if decodeParms.get("/K", 1) == -1: CCITTgroup = 4 else: @@ -451,6 +453,10 @@ def _xobj_to_image(x_object_obj): img_byte_arr = io.BytesIO() img.save(img_byte_arr, format="PNG") data = img_byte_arr.getvalue() + elif x_object_obj["/Filter"] in (["/LZWDecode"], ['/ASCII85Decode'], ['/CCITTFaxDecode']): + from PyPDF2.utils import b_ + extension = ".png" + data = b_(data) elif x_object_obj["/Filter"] == "/DCTDecode": extension = ".jpg" elif x_object_obj["/Filter"] == "/JPXDecode": diff --git a/PyPDF2/generic.py b/PyPDF2/generic.py index 334d76609..2ae9a7471 100644 --- a/PyPDF2/generic.py +++ b/PyPDF2/generic.py @@ -44,6 +44,8 @@ import decimal import codecs +from PyPDF2.utils import ERR_STREAM_TRUNCATED_PREMATURELY + ObjectPrefix = b_('/<[tf(n%') NumberSigns = b_('+-') IndirectPattern = re.compile(b_(r"[+-]?(\d+)\s+(\d+)\s+R[^a-zA-Z]")) @@ -199,8 +201,7 @@ def readFromStream(stream, pdf): while True: tok = stream.read(1) if not tok: - # stream has truncated prematurely - raise PdfStreamError("Stream has ended unexpectedly") + raise PdfStreamError(ERR_STREAM_TRUNCATED_PREMATURELY) if tok.isspace(): break idnum += tok @@ -208,8 +209,7 @@ def readFromStream(stream, pdf): while True: tok = stream.read(1) if not tok: - # stream has truncated prematurely - raise PdfStreamError("Stream has ended unexpectedly") + raise PdfStreamError(ERR_STREAM_TRUNCATED_PREMATURELY) if tok.isspace(): if not generation: continue @@ -273,10 +273,11 @@ def readFromStream(stream): readFromStream = staticmethod(readFromStream) -## -# Given a string (either a "str" or "unicode"), create a ByteStringObject or a -# TextStringObject to represent the string. def createStringObject(string): + """ + Given a string (either a "str" or "unicode"), create a ByteStringObject or a + TextStringObject to represent the string. + """ if isinstance(string, utils.string_type): return TextStringObject(string) elif isinstance(string, utils.bytes_type): @@ -306,8 +307,7 @@ def readHexStringFromStream(stream): while True: tok = readNonWhitespace(stream) if not tok: - # stream has truncated prematurely - raise PdfStreamError("Stream has ended unexpectedly") + raise PdfStreamError(ERR_STREAM_TRUNCATED_PREMATURELY) if tok == b_(">"): break x += tok @@ -328,8 +328,7 @@ def readStringFromStream(stream): while True: tok = stream.read(1) if not tok: - # stream has truncated prematurely - raise PdfStreamError("Stream has ended unexpectedly") + raise PdfStreamError(ERR_STREAM_TRUNCATED_PREMATURELY) if tok == b_("("): parens += 1 elif tok == b_(")"): @@ -392,16 +391,17 @@ def readStringFromStream(stream): return createStringObject(txt) -## -# Represents a string object where the text encoding could not be determined. -# This occurs quite often, as the PDF spec doesn't provide an alternate way to -# represent strings -- for example, the encryption data stored in files (like -# /O) is clearly not text, but is still stored in a "String" object. class ByteStringObject(utils.bytes_type, PdfObject): + """ + Represents a string object where the text encoding could not be determined. + This occurs quite often, as the PDF spec doesn't provide an alternate way to + represent strings -- for example, the encryption data stored in files (like + /O) is clearly not text, but is still stored in a "String" object. + """ ## # For compatibility with TextStringObject.original_bytes. This method - # returns self. + # self. original_bytes = property(lambda self: self) def writeToStream(self, stream, encryption_key): @@ -413,12 +413,14 @@ def writeToStream(self, stream, encryption_key): stream.write(b_(">")) -## -# Represents a string object that has been decoded into a real unicode string. -# If read from a PDF document, this string appeared to match the -# PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding to -# occur. class TextStringObject(utils.string_type, PdfObject): + """ + Represents a string object that has been decoded into a real unicode string. + If read from a PDF document, this string appeared to match the + PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding to + occur. + """ + autodetect_pdfdocencoding = False autodetect_utf16 = False @@ -569,8 +571,7 @@ def readFromStream(stream, pdf): skipOverComment(stream) continue if not tok: - # stream has truncated prematurely - raise PdfStreamError("Stream has ended unexpectedly") + raise PdfStreamError(ERR_STREAM_TRUNCATED_PREMATURELY) if debug: print(("Tok:", tok)) if tok == b_(">"): diff --git a/PyPDF2/merger.py b/PyPDF2/merger.py index 854d8cdb8..d5fd22414 100644 --- a/PyPDF2/merger.py +++ b/PyPDF2/merger.py @@ -1,5 +1,3 @@ -# vim: sw=4:expandtab:foldmethod=marker -# # Copyright (c) 2006, Mathieu Fenniak # All rights reserved. # diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py index 0f7692bf5..9bfa1bd0d 100644 --- a/PyPDF2/pdf.py +++ b/PyPDF2/pdf.py @@ -1,7 +1,5 @@ # -*- coding: utf-8 -*- # -# vim: sw=4:expandtab:foldmethod=marker -# # Copyright (c) 2006, Mathieu Fenniak # Copyright (c) 2007, Ashish Kulkarni # @@ -1637,7 +1635,7 @@ def _getObjectFromStream(self, indirectReference): streamData.seek(0, 0) lines = streamData.readlines() for i in range(0, len(lines)): - print((lines[i])) + print(lines[i]) streamData.seek(pos, 0) try: obj = readObject(streamData, self) @@ -2588,11 +2586,6 @@ def mergeRotatedScaledTranslatedPage(self, page2, rotation, scale, tx, ty, expan ctm[1][0], ctm[1][1], ctm[2][0], ctm[2][1]], expand) - ## - # Applys a transformation matrix the page. - # - # @param ctm A 6 elements tuple containing the operands of the - # transformation matrix def addTransformation(self, ctm): """ Applies a transformation matrix to the page. diff --git a/PyPDF2/utils.py b/PyPDF2/utils.py index 3270d86f8..87b3a8b24 100644 --- a/PyPDF2/utils.py +++ b/PyPDF2/utils.py @@ -39,7 +39,7 @@ except ImportError: # Py3 import builtins - +ERR_STREAM_TRUNCATED_PREMATURELY = "Stream has ended unexpectedly" xrange_fn = getattr(builtins, "xrange", range) _basestring = getattr(builtins, "basestring", str) @@ -122,7 +122,7 @@ def skipOverComment(stream): def readUntilRegex(stream, regex, ignore_eof=False): """ Reads until the regular expression pattern matched (ignore the match) - Raise PdfStreamError on premature end-of-file. + :raises PdfStreamError: on premature end-of-file :param bool ignore_eof: If true, ignore end-of-line and return immediately """ name = b_('') @@ -133,7 +133,7 @@ def readUntilRegex(stream, regex, ignore_eof=False): if ignore_eof: return name else: - raise PdfStreamError("Stream has ended unexpectedly") + raise PdfStreamError(ERR_STREAM_TRUNCATED_PREMATURELY) m = regex.search(tok) if m is not None: name += tok[:m.start()] @@ -242,7 +242,6 @@ def b_(s): bc[s] = r return r except Exception: - print(s) r = s.encode('utf-8') if len(s) < 2: bc[s] = r diff --git a/Resources/imagemagick-ASCII85Decode.pdf b/Resources/imagemagick-ASCII85Decode.pdf new file mode 100644 index 000000000..46aabc0fc Binary files /dev/null and b/Resources/imagemagick-ASCII85Decode.pdf differ diff --git a/Resources/imagemagick-CCITTFaxDecode.pdf b/Resources/imagemagick-CCITTFaxDecode.pdf new file mode 100644 index 000000000..e5cbe2043 Binary files /dev/null and b/Resources/imagemagick-CCITTFaxDecode.pdf differ diff --git a/Resources/imagemagick-images.pdf b/Resources/imagemagick-images.pdf new file mode 100644 index 000000000..a5b13392a Binary files /dev/null and b/Resources/imagemagick-images.pdf differ diff --git a/Resources/imagemagick-lzw.pdf b/Resources/imagemagick-lzw.pdf new file mode 100644 index 000000000..b57e07f25 Binary files /dev/null and b/Resources/imagemagick-lzw.pdf differ diff --git a/Resources/metadata.pdf b/Resources/metadata.pdf new file mode 100644 index 000000000..a69369d74 Binary files /dev/null and b/Resources/metadata.pdf differ diff --git a/Tests/test_basic_features.py b/Tests/test_basic_features.py index f3a41fe41..63b2a0a66 100644 --- a/Tests/test_basic_features.py +++ b/Tests/test_basic_features.py @@ -2,9 +2,9 @@ import pytest -from PyPDF2 import PdfFileWriter, PdfFileReader -from PyPDF2.utils import PdfReadError +from PyPDF2 import PdfFileReader, PdfFileWriter from PyPDF2.pdf import convertToInt +from PyPDF2.utils import PdfReadError TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) PROJECT_ROOT = os.path.dirname(TESTS_ROOT) @@ -12,50 +12,50 @@ def test_basic_features(): - output = PdfFileWriter() - document1 = os.path.join(RESOURCE_ROOT, "crazyones.pdf") - input1 = PdfFileReader(document1) + writer = PdfFileWriter() + pdf_path = os.path.join(RESOURCE_ROOT, "crazyones.pdf") + reader = PdfFileReader(pdf_path) # print how many pages input1 has: - print("document1.pdf has %d pages." % input1.getNumPages()) + print("document1.pdf has %d pages." % reader.getNumPages()) # add page 1 from input1 to output document, unchanged - output.addPage(input1.getPage(0)) + writer.addPage(reader.getPage(0)) # add page 2 from input1, but rotated clockwise 90 degrees - output.addPage(input1.getPage(0).rotateClockwise(90)) + writer.addPage(reader.getPage(0).rotateClockwise(90)) # add page 3 from input1, rotated the other way: - output.addPage(input1.getPage(0).rotateCounterClockwise(90)) + writer.addPage(reader.getPage(0).rotateCounterClockwise(90)) # alt: output.addPage(input1.getPage(0).rotateClockwise(270)) # add page 4 from input1, but first add a watermark from another PDF: - page4 = input1.getPage(0) - watermark_pdf = document1 + page4 = reader.getPage(0) + watermark_pdf = pdf_path watermark = PdfFileReader(watermark_pdf) page4.mergePage(watermark.getPage(0)) - output.addPage(page4) + writer.addPage(page4) # add page 5 from input1, but crop it to half size: - page5 = input1.getPage(0) + page5 = reader.getPage(0) page5.mediaBox.upperRight = ( page5.mediaBox.getUpperRight_x() / 2, page5.mediaBox.getUpperRight_y() / 2, ) - output.addPage(page5) + writer.addPage(page5) # add some Javascript to launch the print window on opening this PDF. # the password dialog may prevent the print dialog from being shown, # comment the the encription lines, if that's the case, to try this out - output.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});") + writer.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});") # encrypt your new PDF and add a password password = "secret" - output.encrypt(password) + writer.encrypt(password) # finally, write "output" to PyPDF2-output.pdf with open("PyPDF2-output.pdf", "wb") as outputStream: - output.write(outputStream) + writer.write(outputStream) def test_convertToInt(): diff --git a/Tests/test_javascript.py b/Tests/test_javascript.py index d49f4dc26..4048a76f0 100644 --- a/Tests/test_javascript.py +++ b/Tests/test_javascript.py @@ -1,4 +1,5 @@ import os + import pytest from PyPDF2 import PdfFileReader, PdfFileWriter @@ -8,21 +9,28 @@ PROJECT_ROOT = os.path.dirname(TESTS_ROOT) RESOURCE_ROOT = os.path.join(PROJECT_ROOT, "Resources") + @pytest.fixture def pdf_file_writer(): - ipdf = PdfFileReader(os.path.join(RESOURCE_ROOT, "crazyones.pdf")) + reader = PdfFileReader(os.path.join(RESOURCE_ROOT, "crazyones.pdf")) pdf_file_writer = PdfFileWriter() - pdf_file_writer.appendPagesFromReader(ipdf) + pdf_file_writer.appendPagesFromReader(reader) yield pdf_file_writer + def test_add_js(pdf_file_writer): - pdf_file_writer.addJS( - "this.print({bUI:true,bSilent:false,bShrinkToFit:true});" - ) + pdf_file_writer.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});") + + assert ( + "/Names" in pdf_file_writer._root_object + ), "addJS should add a name catalog in the root object." + assert ( + "/JavaScript" in pdf_file_writer._root_object["/Names"] + ), "addJS should add a JavaScript name tree under the name catalog." + assert ( + "/OpenAction" in pdf_file_writer._root_object + ), "addJS should add an OpenAction to the catalog." - assert "/Names" in pdf_file_writer._root_object, "addJS should add a name catalog in the root object." - assert "/JavaScript" in pdf_file_writer._root_object["/Names"], "addJS should add a JavaScript name tree under the name catalog." - assert "/OpenAction" in pdf_file_writer._root_object, "addJS should add an OpenAction to the catalog." def test_overwrite_js(pdf_file_writer): def get_javascript_name(): @@ -31,14 +39,12 @@ def get_javascript_name(): assert "/Names" in pdf_file_writer._root_object["/Names"]["/JavaScript"] return pdf_file_writer._root_object["/Names"]["/JavaScript"]["/Names"][0] - pdf_file_writer.addJS( - "this.print({bUI:true,bSilent:false,bShrinkToFit:true});" - ) + pdf_file_writer.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});") first_js = get_javascript_name() - pdf_file_writer.addJS( - "this.print({bUI:true,bSilent:false,bShrinkToFit:true});" - ) + pdf_file_writer.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});") second_js = get_javascript_name() - assert first_js != second_js, "addJS should overwrite the previous script in the catalog." + assert ( + first_js != second_js + ), "addJS should overwrite the previous script in the catalog." diff --git a/Tests/test_merger.py b/Tests/test_merger.py index 49048a741..959560d4a 100644 --- a/Tests/test_merger.py +++ b/Tests/test_merger.py @@ -14,6 +14,7 @@ def test_merge(): pdf_path = os.path.join(RESOURCE_ROOT, "crazyones.pdf") outline = os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf") pdf_forms = os.path.join(RESOURCE_ROOT, "pdflatex-forms.pdf") + pdf_pw = os.path.join(RESOURCE_ROOT, "libreoffice-writer-password.pdf") file_merger = PyPDF2.PdfFileMerger() @@ -23,20 +24,24 @@ def test_merge(): file_merger.append(pdf_path, pages=PyPDF2.pagerange.PageRange(slice(0, 0))) file_merger.append(pdf_forms) - # PdfFileReader object: - file_merger.append(PyPDF2.PdfFileReader(pdf_path, "rb")) + # Merging an encrypted file + pdfr = PyPDF2.PdfFileReader(pdf_pw) + pdfr.decrypt("openpassword") + file_merger.append(pdfr) - # Is merging encrypted files broken? - # encrypted = os.path.join(RESOURCE_ROOT, "libreoffice-writer-password.pdf") - # reader = PyPDF2.PdfFileReader(pdf_path, "rb") - # reader.decrypt("openpassword") - # file_merger.append(reader) + # PdfFileReader object: + file_merger.append(PyPDF2.PdfFileReader(pdf_path, "rb"), bookmark=True) # File handle with open(pdf_path, "rb") as fh: file_merger.append(fh) - file_merger.addBookmark("A bookmark", 0) + bookmark = file_merger.addBookmark("A bookmark", 0) + file_merger.addBookmark("deeper", 0, parent=bookmark) + file_merger.addMetadata({"author": "Martin Thoma"}) + file_merger.addNamedDestination("title", 0) + file_merger.setPageLayout("/SinglePage") + file_merger.setPageMode("/UseThumbs") file_merger.write("dont_commit_merged.pdf") file_merger.close() diff --git a/Tests/test_page.py b/Tests/test_page.py index 5b15f9f58..91c49bab1 100644 --- a/Tests/test_page.py +++ b/Tests/test_page.py @@ -1,5 +1,7 @@ import os +import pytest + from PyPDF2 import PdfFileReader TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) @@ -7,15 +9,40 @@ RESOURCE_ROOT = os.path.join(PROJECT_ROOT, "Resources") -def test_page_operations(): +@pytest.mark.parametrize( + "pdf_path, password", + [ + (os.path.join(RESOURCE_ROOT, "crazyones.pdf"), None), + (os.path.join(RESOURCE_ROOT, "attachment.pdf"), None), + (os.path.join(RESOURCE_ROOT, "side-by-side-subfig.pdf"), None), + ( + os.path.join(RESOURCE_ROOT, "libreoffice-writer-password.pdf"), + "openpassword", + ), + (os.path.join(RESOURCE_ROOT, "imagemagick-images.pdf"), None), + (os.path.join(RESOURCE_ROOT, "imagemagick-lzw.pdf"), None), + ], + ids=[ + "crazyones", + "attachment", + "side-by-side-subfig", + "libreoffice-writer-password", + "imagemagick-images", + "imagemagick-lzw", + ], +) +def test_page_operations(pdf_path, password): """ This test just checks if the operation throws an exception. This should be done way more thoroughly: It should be checked if the output is as expected. """ - pdf_path = os.path.join(RESOURCE_ROOT, "crazyones.pdf") reader = PdfFileReader(pdf_path) + + if password: + reader.decrypt(password) + page = reader.pages[0] page.mergeRotatedScaledPage(page, 90, 1, 1) page.mergeScaledTranslatedPage(page, 1, 1, 1) @@ -26,3 +53,23 @@ def test_page_operations(): page.scaleTo(100, 100) page.compressContentStreams() page.extractText() + + +@pytest.mark.parametrize( + "pdf_path, password", + [ + (os.path.join(RESOURCE_ROOT, "crazyones.pdf"), None), + (os.path.join(RESOURCE_ROOT, "attachment.pdf"), None), + (os.path.join(RESOURCE_ROOT, "side-by-side-subfig.pdf"), None), + ( + os.path.join(RESOURCE_ROOT, "libreoffice-writer-password.pdf"), + "openpassword", + ), + ], +) +def test_compress_content_streams(pdf_path, password): + reader = PdfFileReader(pdf_path) + if password: + reader.decrypt(password) + for page in reader.pages: + page.compressContentStreams() diff --git a/Tests/test_pagerange.py b/Tests/test_pagerange.py index b213de5a0..05bd5fee0 100644 --- a/Tests/test_pagerange.py +++ b/Tests/test_pagerange.py @@ -3,6 +3,18 @@ from PyPDF2.pagerange import PageRange, ParseError, parse_filename_page_ranges +def test_equality(): + pr1 = PageRange(slice(0, 5)) + pr2 = PageRange(slice(0, 5)) + assert pr1 == pr2 + + +def test_equality_other_objectc(): + pr1 = PageRange(slice(0, 5)) + pr2 = "PageRange(slice(0, 5))" + assert pr1 != pr2 + + def test_idempotency(): pr = PageRange(slice(0, 5)) pr2 = PageRange(pr) diff --git a/Tests/test_reader.py b/Tests/test_reader.py index 6cf736d59..d4cbafb12 100644 --- a/Tests/test_reader.py +++ b/Tests/test_reader.py @@ -1,7 +1,10 @@ import io import os + import pytest + import PyPDF2.utils +from PyPDF2 import PdfFileReader from PyPDF2.filters import _xobj_to_image TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) @@ -9,16 +12,44 @@ RESOURCE_ROOT = os.path.join(PROJECT_ROOT, "Resources") -def test_read_metadata(): - with open(os.path.join(RESOURCE_ROOT, "crazyones.pdf"), "rb") as inputfile: - ipdf = PyPDF2.PdfFileReader(inputfile) - metadict = ipdf.getDocumentInfo() - assert metadict.title is None - assert dict(metadict) == { - "/CreationDate": "D:20150604133406-06'00'", - "/Creator": " XeTeX output 2015.06.04:1334", - "/Producer": "xdvipdfmx (20140317)", - } +@pytest.mark.parametrize( + "pdf_path, expected", + [ + ( + os.path.join(RESOURCE_ROOT, "crazyones.pdf"), + { + "/CreationDate": "D:20150604133406-06'00'", + "/Creator": " XeTeX output 2015.06.04:1334", + "/Producer": "xdvipdfmx (20140317)", + }, + ), + ( + os.path.join(RESOURCE_ROOT, "metadata.pdf"), + { + "/CreationDate": "D:20220415093243+02'00'", + "/ModDate": "D:20220415093243+02'00'", + "/Creator": "pdflatex, or other tool", + "/Producer": "Latex with hyperref, or other system", + "/Author": "Martin Thoma", + "/Keywords": "Some Keywords, other keywords; more keywords", + "/Subject": "The Subject", + "/Title": "The Title", + "/Trapped": "/False", + "/PTEX.Fullbanner": ( + "This is pdfTeX, Version " + "3.141592653-2.6-1.40.23 (TeX Live 2021) " + "kpathsea version 6.3.3" + ), + }, + ), + ], + ids=["crazyones", "metadata"], +) +def test_read_metadata(pdf_path, expected): + with open(pdf_path, "rb") as inputfile: + reader = PdfFileReader(inputfile) + metadict = reader.getDocumentInfo() + assert dict(metadict) == expected @pytest.mark.parametrize( @@ -29,16 +60,14 @@ def test_read_metadata(): ], ) def test_get_annotations(src): - reader = PyPDF2.PdfFileReader(src) + reader = PdfFileReader(src) for page in reader.pages: - print("/Annots" in page) if "/Annots" in page: for annot in page["/Annots"]: subtype = annot.getObject()["/Subtype"] if subtype == "/Text": - print(annot.getObject()["/Contents"]) - print("") + annot.getObject()["/Contents"] @pytest.mark.parametrize( @@ -49,7 +78,7 @@ def test_get_annotations(src): ], ) def test_get_attachments(src): - reader = PyPDF2.PdfFileReader(src) + reader = PdfFileReader(src) attachments = {} for i in range(reader.getNumPages()): @@ -71,7 +100,7 @@ def test_get_attachments(src): ], ) def test_get_outlines(src, outline_elements): - reader = PyPDF2.PdfFileReader(src) + reader = PdfFileReader(src) outlines = reader.getOutlines() assert len(outlines) == outline_elements @@ -79,13 +108,17 @@ def test_get_outlines(src, outline_elements): @pytest.mark.parametrize( "src,nb_images", [ - (os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf"), 0), - (os.path.join(RESOURCE_ROOT, "crazyones.pdf"), 0), - (os.path.join(RESOURCE_ROOT, "git.pdf"), 1), + ("pdflatex-outline.pdf", 0), + ("crazyones.pdf", 0), + ("git.pdf", 1), + ("imagemagick-lzw.pdf", 1), + ("imagemagick-ASCII85Decode.pdf", 1), + ("imagemagick-CCITTFaxDecode.pdf", 1), ], ) def test_get_images(src, nb_images): - reader = PyPDF2.PdfFileReader(src) + src =os.path.join(RESOURCE_ROOT, src) + reader = PdfFileReader(src) with pytest.raises(TypeError): page = reader.pages["0"] @@ -106,11 +139,13 @@ def test_get_images(src, nb_images): with open(filename, "wb") as img: img.write(byte_stream) images_extracted.append(filename) - else: - print("No image found.") assert len(images_extracted) == nb_images + # Cleanup + for filepath in images_extracted: + os.remove(filepath) + @pytest.mark.parametrize( "strict,with_prev_0,should_fail", @@ -153,10 +188,10 @@ def test_get_images_raw(strict, with_prev_0, should_fail): ) pdf_stream = io.BytesIO(pdf_data) if should_fail: - with pytest.raises(PyPDF2.pdf.utils.PdfReadError): - PyPDF2.PdfFileReader(pdf_stream, strict=strict) + with pytest.raises(PyPDF2.utils.PdfReadError): + PdfFileReader(pdf_stream, strict=strict) else: - PyPDF2.PdfFileReader(pdf_stream, strict=strict) + PdfFileReader(pdf_stream, strict=strict) @pytest.mark.xfail( @@ -167,5 +202,5 @@ def test_get_images_raw(strict, with_prev_0, should_fail): ) def test_issue297(): path = os.path.join(RESOURCE_ROOT, "issue-297.pdf") - reader = PyPDF2.PdfFileReader(path, "rb") + reader = PdfFileReader(path, "rb") reader.getPage(0) diff --git a/Tests/test_utils.py b/Tests/test_utils.py index a305dff7e..fb31edb9f 100644 --- a/Tests/test_utils.py +++ b/Tests/test_utils.py @@ -1,6 +1,14 @@ +import io +import os + import pytest + import PyPDF2.utils -import io +from PyPDF2 import PdfFileReader + +TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) +PROJECT_ROOT = os.path.dirname(TESTS_ROOT) +RESOURCE_ROOT = os.path.join(PROJECT_ROOT, "Resources") @pytest.mark.parametrize( @@ -10,6 +18,10 @@ def test_isInt(value, expected): assert PyPDF2.utils.isInt(value) == expected +def test_isBytes(): + assert PyPDF2.utils.isBytes(b"") + + @pytest.mark.parametrize( "stream,expected", [ @@ -73,3 +85,15 @@ def test_matrixMultiply(a, b, expected): def test_markLocation(): stream = io.BytesIO(b"abde" * 6000) PyPDF2.utils.markLocation(stream) + + +def test_ConvertFunctionsToVirtualList(): + pdf_path = os.path.join(RESOURCE_ROOT, "crazyones.pdf") + reader = PdfFileReader(pdf_path) + + # Test if getting as slice throws an error + assert len(reader.pages[:]) == 1 + + +def test_hexStr(): + assert PyPDF2.utils.hexStr(10) == "0xa" diff --git a/Tests/test_workflows.py b/Tests/test_workflows.py index 9c8f17408..821fa7a49 100644 --- a/Tests/test_workflows.py +++ b/Tests/test_workflows.py @@ -1,8 +1,9 @@ # -*- coding: utf-8 -*- -import os import binascii +import os import sys + import pytest from PyPDF2 import PdfFileReader diff --git a/Tests/test_writer.py b/Tests/test_writer.py index bd8cd9d8c..8356c94bc 100644 --- a/Tests/test_writer.py +++ b/Tests/test_writer.py @@ -1,9 +1,10 @@ import os + import pytest from PyPDF2 import PdfFileReader, PdfFileWriter -from PyPDF2.utils import PageSizeNotDefinedError from PyPDF2.generic import RectangleObject +from PyPDF2.utils import PageSizeNotDefinedError TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) PROJECT_ROOT = os.path.dirname(TESTS_ROOT) @@ -80,3 +81,31 @@ def test_remove_images(): # Cleanup os.remove(tmp_filename) + + +def test_write_metadata(): + pdf_path = os.path.join(RESOURCE_ROOT, "crazyones.pdf") + + reader = PdfFileReader(pdf_path) + writer = PdfFileWriter() + + for page in reader.pages: + writer.addPage(page) + + metadata = reader.getDocumentInfo() + writer.addMetadata(metadata) + + writer.addMetadata({"/Title": "The Crazy Ones"}) + + # finally, write data to PyPDF2-output.pdf + tmp_filename = "dont_commit_writer_added_metadata.pdf" + with open(tmp_filename, "wb") as output_stream: + writer.write(output_stream) + + # Check if the title was set + reader = PdfFileReader(tmp_filename) + metadata = reader.getDocumentInfo() + assert metadata.get("/Title") == "The Crazy Ones" + + # Cleanup + os.remove(tmp_filename) diff --git a/Tests/test_xmp.py b/Tests/test_xmp.py index 8fc7bf3a8..941f9d30d 100644 --- a/Tests/test_xmp.py +++ b/Tests/test_xmp.py @@ -1,6 +1,9 @@ import os + import pytest -import PyPDF2 + +import PyPDF2.xmp +from PyPDF2 import PdfFileReader TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) PROJECT_ROOT = os.path.dirname(TESTS_ROOT) @@ -15,9 +18,27 @@ ], ) def test_read_xmp(src, has_xmp): - with open(src, "rb") as inputfile: - ipdf = PyPDF2.PdfFileReader(inputfile) - xmp = ipdf.getXmpMetadata() - assert (xmp is None) == (not has_xmp) - if has_xmp: - print(xmp.xmp_createDate) + reader = PdfFileReader(src) + xmp = reader.getXmpMetadata() + assert (xmp is None) == (not has_xmp) + if has_xmp: + for el in xmp.getElement( + aboutUri="", namespace=PyPDF2.xmp.RDF_NAMESPACE, name="Artist" + ): + print("el={el}".format(el=el)) + + assert get_all_tiff(xmp) == {"tiff:Artist": ["me"]} + assert xmp.dc_contributor == [] + + +def get_all_tiff(xmp): + data = {} + tiff_ns = xmp.getNodesInNamespace( + aboutUri="", namespace="http://ns.adobe.com/tiff/1.0/" + ) + for tag in tiff_ns: + contents = [] + for content in tag.childNodes: + contents.append(content.data) + data[tag.tagName] = contents + return data