Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TST: Fix test_get_images #730

Merged
merged 3 commits into from
Apr 10, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions PyPDF2/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,3 +420,48 @@ def decodeStreamData(stream):
# unsupported filter
raise NotImplementedError("unsupported filter %s" % filterType)
return data


def _xobj_to_image(x_object_obj):
"""
Users need to have the pillow package installed.

It's unclear if PyPDF2 will keep this function here, hence it's private.
It might get removed at any point.

:return: Tuple[file extension, bytes]
"""
import io
from PIL import Image

size = (x_object_obj["/Width"], x_object_obj["/Height"])
data = x_object_obj.getData()
if x_object_obj["/ColorSpace"] == "/DeviceRGB":
mode = "RGB"
else:
mode = "P"
extension = None
if "/Filter" in x_object_obj:
if x_object_obj["/Filter"] == "/FlateDecode":
extension = ".png"
img = Image.frombytes(mode, size, data)
if "/SMask" in x_object_obj: # add alpha channel
alpha = Image.frombytes("L", size, x_object_obj["/SMask"].getData())
img.putalpha(alpha)
img_byte_arr = io.BytesIO()
img.save(img_byte_arr, format="PNG")
data = img_byte_arr.getvalue()
elif x_object_obj["/Filter"] == "/DCTDecode":
extension = ".jpg"
elif x_object_obj["/Filter"] == "/JPXDecode":
extension = ".jp2"
elif x_object_obj["/Filter"] == "/CCITTFaxDecode":
extension = ".tiff"
else:
extension = ".png"
img = Image.frombytes(mode, size, data)
img_byte_arr = io.BytesIO()
img.save(img_byte_arr, format="PNG")
data = img_byte_arr.getvalue()

return extension, data
66 changes: 23 additions & 43 deletions Scripts/pdf-image-extractor.py
Original file line number Diff line number Diff line change
@@ -1,57 +1,37 @@
'''
"""
Extract images from PDF without resampling or altering.

Adapted from work by Sylvain Pelissier
http://stackoverflow.com/questions/2693820/extract-images-from-pdf-without-resampling-in-python
'''
"""

import sys
import PyPDF2
from PIL import Image
from PyPDF2.filters import _xobj_to_image

def main(pdf: str, page_index: int = 30):
    """
    Write every image found on one page of a PDF into the current directory.

    Each image is saved under its XObject name minus the first character,
    followed by the file extension reported by ``_xobj_to_image``. Images
    for which no extension is reported are skipped.

    :param pdf: path of the PDF file to read
    :param page_index: index into ``reader.pages`` of the page to scan;
        defaults to 30, the value this script previously hard-coded
    """
    # Keep the file open for the whole extraction and close it afterwards.
    # NOTE(review): presumably PyPDF2 reads object data on demand - confirm.
    with open(pdf, "rb") as stream:
        reader = PyPDF2.PdfFileReader(stream)
        page = reader.pages[page_index]

        if "/XObject" not in page["/Resources"]:
            print("No image found.")
            return

        xObject = page["/Resources"]["/XObject"].getObject()
        for obj in xObject:
            if xObject[obj]["/Subtype"] != "/Image":
                continue
            extension, byte_stream = _xobj_to_image(xObject[obj])
            if extension is None:
                continue
            # Use the reported extension so that e.g. JPEG data is not
            # written into a file named "*.png".
            filename = obj[1:] + extension
            with open(filename, "wb") as img:
                img.write(byte_stream)


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("\nUsage: python {} input_file\n".format(sys.argv[0]))
        sys.exit(1)

    pdf = sys.argv[1]
    main(pdf)
58 changes: 12 additions & 46 deletions Tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import pytest
import PyPDF2
from PyPDF2.filters import decodeStreamData, _xobj_to_image

TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))
PROJECT_ROOT = os.path.dirname(TESTS_ROOT)
Expand Down Expand Up @@ -84,61 +85,26 @@ def test_get_outlines(src, outline_elements):
],
)
def test_get_images(src, nb_images):
from PIL import Image

input1 = PyPDF2.PdfFileReader(open(src, "rb"))
reader = PyPDF2.PdfFileReader(open(src, "rb"))

with pytest.raises(TypeError):
page0 = input1.pages["0"]
page = reader.pages["0"]

page0 = input1.pages[-1]
page0 = input1.pages[0]
page = reader.pages[-1]
page = reader.pages[0]

images_extracted = []

if "/XObject" in page0["/Resources"]:
xObject = page0["/Resources"]["/XObject"].getObject()
if "/XObject" in page["/Resources"]:
xObject = page["/Resources"]["/XObject"].getObject()

for obj in xObject:
if xObject[obj]["/Subtype"] == "/Image":
size = (xObject[obj]["/Width"], xObject[obj]["/Height"])
data = xObject[obj].getData()
if xObject[obj]["/ColorSpace"] == "/DeviceRGB":
mode = "RGB"
else:
mode = "P"

filename = None
if "/Filter" in xObject[obj]:
if xObject[obj]["/Filter"] == "/FlateDecode":
img = Image.frombytes(mode, size, data)
if "/SMask" in xObject[obj]: # add alpha channel
alpha = Image.frombytes(
"L", size, xObject[obj]["/SMask"].getData()
)
img.putalpha(alpha)
filename = obj[1:] + ".png"
img.save(filename)
elif xObject[obj]["/Filter"] == "/DCTDecode":
filename = obj[1:] + ".jpg"
img = open(filename, "wb")
img.write(data)
img.close()
elif xObject[obj]["/Filter"] == "/JPXDecode":
filename = obj[1:] + ".jp2"
img = open(filename, "wb")
img.write(data)
img.close()
elif xObject[obj]["/Filter"] == "/CCITTFaxDecode":
filename = obj[1:] + ".tiff"
img = open(filename, "wb")
img.write(data)
img.close()
else:
img = Image.frombytes(mode, size, data)
extension, byte_stream = _xobj_to_image(xObject[obj])
if extension is not None:
filename = obj[1:] + ".png"
img.save(filename)
if filename is not None:
with open(filename, "wb") as img:
img.write(byte_stream)
images_extracted.append(filename)
else:
print("No image found.")
Expand All @@ -155,7 +121,7 @@ def test_get_images(src, nb_images):
(False, False, False),
],
)
def test_get_images(strict, with_prev_0, should_fail):
def test_get_images_raw(strict, with_prev_0, should_fail):
pdf_data = b"%%PDF-1.7\n" \
b"1 0 obj << /Count 1 /Kids [4 0 R] /Type /Pages >> endobj\n" \
b"2 0 obj << >> endobj\n" \
Expand Down