Skip to content

Commit

Permalink
[MRG + 1] Add basic support for encrypted PDF files (#180)
Browse files Browse the repository at this point in the history
* [MRG] Add basic support for encrypted PDF files

Update API and CLI to accept ASCII passwords to decrypt PDFs
encrypted by algorithm code 1 or 2 (limited by support from PyPDF2).
Update documentation and unit tests accordingly.

Example document health_protected.pdf generated as follows:
qpdf --encrypt userpass ownerpass 128 -- health.pdf health_protected.pdf

Issue #162

* Support encrypted PDF files in python3

Issue #162

* Address review comments

Explicitly check passwords for None rather than falsey.
Correct read_pdf documentation for Owner/User password.

Issue #162

* Correct API documentation changes for consistency

Issue #162

* Move error tests from test_common to test_errors

Issue #162

* Add qpdf example

* Remove password is not None check

* Fix merge conflict

* Fix pages example
  • Loading branch information
rbares authored and vinayak-mehta committed Oct 28, 2018
1 parent 4366313 commit 429640f
Show file tree
Hide file tree
Showing 9 changed files with 117 additions and 30 deletions.
1 change: 1 addition & 0 deletions camelot/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def set_config(self, key, value):
@click.version_option(version=__version__)
@click.option('-p', '--pages', default='1', help='Comma-separated page numbers.'
' Example: 1,3,4 or 1,4-end.')
@click.option('-pw', '--password', help='Password for decryption.')
@click.option('-o', '--output', help='Output file path.')
@click.option('-f', '--format',
type=click.Choice(['csv', 'json', 'excel', 'html']),
Expand Down
19 changes: 15 additions & 4 deletions camelot/handlers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-

import os
import sys

from PyPDF2 import PdfFileReader, PdfFileWriter

Expand All @@ -21,14 +22,22 @@ class PDFHandler(object):
Path to PDF file.
pages : str, optional (default: '1')
Comma-separated page numbers.
Example: 1,3,4 or 1,4-end.
Example: '1,3,4' or '1,4-end'.
password : str, optional (default: None)
Password for decryption.
"""
def __init__(self, filename, pages='1'):
def __init__(self, filename, pages='1', password=None):
    """Validate the input path, normalise the password and resolve pages.

    Parameters
    ----------
    filename : str
        Path to PDF file.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end'.
    password : str, optional (default: None)
        Password for decryption. ``None`` is stored as an empty string.

    Raises
    ------
    NotImplementedError
        If ``filename`` does not end with ``.pdf`` (case-insensitive).
    """
    self.filename = filename
    if not filename.lower().endswith('.pdf'):
        raise NotImplementedError("File format not supported")
    # The password has to be in place *before* the pages are resolved:
    # _get_pages() calls decrypt(self.password) whenever it has to open an
    # encrypted file, and would otherwise fail with AttributeError.
    if password is None:
        self.password = ''
    else:
        self.password = password
        if sys.version_info[0] < 3:
            # On Python 2 the password is encoded to an ASCII byte string.
            self.password = self.password.encode('ascii')
    self.pages = self._get_pages(self.filename, pages)

def _get_pages(self, filename, pages):
"""Converts pages string to list of ints.
Expand All @@ -52,6 +61,8 @@ def _get_pages(self, filename, pages):
page_numbers.append({'start': 1, 'end': 1})
else:
infile = PdfFileReader(open(filename, 'rb'), strict=False)
if infile.isEncrypted:
infile.decrypt(self.password)
if pages == 'all':
page_numbers.append({'start': 1, 'end': infile.getNumPages()})
else:
Expand Down Expand Up @@ -84,7 +95,7 @@ def _save_page(self, filename, page, temp):
with open(filename, 'rb') as fileobj:
infile = PdfFileReader(fileobj, strict=False)
if infile.isEncrypted:
infile.decrypt('')
infile.decrypt(self.password)
fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
froot, fext = os.path.splitext(fpath)
p = infile.getPage(page - 1)
Expand All @@ -103,7 +114,7 @@ def _save_page(self, filename, page, temp):
os.rename(fpath, fpath_new)
infile = PdfFileReader(open(fpath_new, 'rb'), strict=False)
if infile.isEncrypted:
infile.decrypt('')
infile.decrypt(self.password)
outfile = PdfFileWriter()
p = infile.getPage(0)
if rotation == 'anticlockwise':
Expand Down
8 changes: 5 additions & 3 deletions camelot/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
from .utils import validate_input, remove_extra


def read_pdf(filepath, pages='1', flavor='lattice', suppress_warnings=False,
**kwargs):
def read_pdf(filepath, pages='1', password=None, flavor='lattice',
suppress_warnings=False, **kwargs):
"""Read PDF and return extracted tables.
Note: kwargs annotated with ^ can only be used with flavor='stream'
Expand All @@ -19,6 +19,8 @@ def read_pdf(filepath, pages='1', flavor='lattice', suppress_warnings=False,
pages : str, optional (default: '1')
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end'.
password : str, optional (default: None)
Password for decryption.
flavor : str (default: 'lattice')
The parsing method to use ('lattice' or 'stream').
Lattice is used by default.
Expand Down Expand Up @@ -94,7 +96,7 @@ def read_pdf(filepath, pages='1', flavor='lattice', suppress_warnings=False,
warnings.simplefilter("ignore")

validate_input(kwargs, flavor=flavor)
p = PDFHandler(filepath, pages)
p = PDFHandler(filepath, pages=pages, password=password)
kwargs = remove_extra(kwargs, flavor=flavor)
tables = p.parse(flavor=flavor, **kwargs)
return tables
42 changes: 22 additions & 20 deletions docs/user/cli.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,26 +9,28 @@ You can print the help for the interface by typing ``camelot --help`` in your fa

::

Usage: camelot [OPTIONS] COMMAND [ARGS]...
Usage: camelot [OPTIONS] COMMAND [ARGS]...

Camelot: PDF Table Extraction for Humans

Options:
--version Show the version and exit.
-p, --pages TEXT Comma-separated page numbers. Example: 1,3,4
or 1,4-end.
-o, --output TEXT Output file path.
-f, --format [csv|json|excel|html]
Output file format.
-z, --zip Create ZIP archive.
-split, --split_text Split text that spans across multiple cells.
-flag, --flag_size Flag text based on font size. Useful to
detect super/subscripts.
-M, --margins <FLOAT FLOAT FLOAT>...
PDFMiner char_margin, line_margin and
word_margin.
--help Show this message and exit.

Commands:
lattice Use lines between text to parse the table.
stream Use spaces between text to parse the table.
Options:
--version Show the version and exit.
-p, --pages TEXT Comma-separated page numbers. Example: 1,3,4
or 1,4-end.
-pw, --password TEXT Password for decryption.
-o, --output TEXT Output file path.
-f, --format [csv|json|excel|html]
Output file format.
-z, --zip Create ZIP archive.
-split, --split_text Split text that spans across multiple cells.
-flag, --flag_size Flag text based on font size. Useful to
detect super/subscripts.
-M, --margins <FLOAT FLOAT FLOAT>...
PDFMiner char_margin, line_margin and
word_margin.
-q, --quiet Suppress warnings.
--help Show this message and exit.

Commands:
lattice Use lines between text to parse the table.
stream Use spaces between text to parse the table.
26 changes: 24 additions & 2 deletions docs/user/quickstart.rst
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,28 @@ By default, Camelot only uses the first page of the PDF to extract tables. To sp

The ``pages`` keyword argument accepts pages as a comma-separated string of page numbers. You can also specify page ranges — for example, ``pages='1,4-10,20-30'`` or ``pages='1,4-10,20-end'``.

------------------------
Reading encrypted PDFs
----------------------

Ready for more? Check out the :ref:`advanced <advanced>` section.
To extract tables from encrypted PDF files you must provide a password when calling :meth:`read_pdf() <camelot.read_pdf>`.

::

>>> tables = camelot.read_pdf('foo.pdf', password='userpass')
>>> tables
<TableList n=1>

Currently Camelot only supports PDFs encrypted with ASCII passwords and algorithm `code 1 or 2`_. An exception is thrown if the PDF cannot be read. This may be due to no password being provided, an incorrect password, or an unsupported encryption algorithm.

Further encryption support may be added in future; in the meantime, if your PDF files use an unsupported encryption algorithm, you are advised to remove the encryption before calling :meth:`read_pdf() <camelot.read_pdf>`. This can be successfully achieved with third-party tools such as `QPDF`_.

::

$ qpdf --password=<PASSWORD> --decrypt input.pdf output.pdf

.. _code 1 or 2: https://github.com/mstamy2/PyPDF2/issues/378
.. _QPDF: https://www.github.com/qpdf/qpdf

----

Ready for more? Check out the :ref:`advanced <advanced>` section.
Binary file added tests/files/health_protected.pdf
Binary file not shown.
26 changes: 25 additions & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,30 @@ def test_cli_stream():
assert format_error in result.output


def test_cli_password():
    """Exercise the ``--password`` CLI option against an encrypted PDF.

    The correct user password must succeed and report one table; a missing
    or wrong password must surface the "not decrypted" error as the
    exception recorded on the CLI result.
    """
    decrypt_failure = 'file has not been decrypted'

    with TemporaryDirectory() as tempdir:
        source_pdf = os.path.join(testdir, 'health_protected.pdf')
        target_csv = os.path.join(tempdir, 'health_protected.csv')
        # Arguments shared by every invocation below.
        common_args = ['--format', 'csv', '--output', target_csv,
                       'stream', source_pdf]
        runner = CliRunner()

        # correct password
        outcome = runner.invoke(cli, ['--password', 'userpass'] + common_args)
        assert outcome.exit_code == 0
        assert outcome.output == 'Found 1 tables\n'

        # no password
        outcome = runner.invoke(cli, common_args)
        assert decrypt_failure in str(outcome.exception)

        # bad password
        outcome = runner.invoke(cli, ['--password', 'wrongpass'] + common_args)
        assert decrypt_failure in str(outcome.exception)


def test_cli_output_format():
with TemporaryDirectory() as tempdir:
infile = os.path.join(testdir, 'health.pdf')
Expand All @@ -78,7 +102,7 @@ def test_cli_output_format():
'stream', infile])
assert result.exit_code == 0

def test_cli_quiet_flag():
def test_cli_quiet():
with TemporaryDirectory() as tempdir:
infile = os.path.join(testdir, 'blank.pdf')
outfile = os.path.join(tempdir, 'blank.csv')
Expand Down
11 changes: 11 additions & 0 deletions tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,17 @@ def test_parsing_report():
assert tables[0].parsing_report == parsing_report


def test_password():
    """Both the owner and the user password must decrypt the protected PDF."""
    expected = pd.DataFrame(data_stream)
    protected_pdf = os.path.join(testdir, "health_protected.pdf")

    # Same order as before: owner password first, then user password.
    for secret in ("ownerpass", "userpass"):
        extracted = camelot.read_pdf(
            protected_pdf, password=secret, flavor="stream")
        assert expected.equals(extracted[0].df)


def test_stream():
df = pd.DataFrame(data_stream)

Expand Down
14 changes: 14 additions & 0 deletions tests/test_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,17 @@ def _find_executable_patch(arg):
filename = os.path.join(testdir, 'foo.pdf')
with pytest.raises(Exception, message=message):
tables = camelot.read_pdf(filename)


def test_no_password():
    """Reading an encrypted PDF without a password fails with a decryption error."""
    filename = os.path.join(testdir, 'health_protected.pdf')
    message = 'file has not been decrypted'
    # ``match`` checks the text of the raised exception. The former
    # ``message=`` keyword only customised the failure output when nothing
    # was raised, so the error text was never actually verified.
    with pytest.raises(Exception, match=message):
        camelot.read_pdf(filename)


def test_bad_password():
    """Reading an encrypted PDF with a wrong password fails with a decryption error."""
    filename = os.path.join(testdir, 'health_protected.pdf')
    message = 'file has not been decrypted'
    # ``match`` checks the text of the raised exception. The former
    # ``message=`` keyword only customised the failure output when nothing
    # was raised, so the error text was never actually verified.
    with pytest.raises(Exception, match=message):
        camelot.read_pdf(filename, password='wrongpass')

0 comments on commit 429640f

Please sign in to comment.