[MRG + 1] Add basic support for encrypted PDF files (#180)

* [MRG] Add basic support for encrypted PDF files

  Update API and CLI to accept ASCII passwords to decrypt PDFs encrypted by algorithm code 1 or 2 (limited by support from PyPDF2). Update documentation and unit tests accordingly.

  Example document health_protected.pdf generated as follows:

      qpdf --encrypt userpass ownerpass 128 -- health.pdf health_protected.pdf

  Issue #162

* Support encrypted PDF files in python3

  Issue #162

* Address review comments

  Explicitly check passwords for None rather than falsey. Correct read_pdf documentation for Owner/User password.

  Issue #162

* Correct API documentation changes for consistency

  Issue #162

* Move error tests from test_common to test_errors

  Issue #162

* Add qpdf example
* Remove password is not None check
* Fix merge conflict
* Fix pages example
atlanhq · Oct 28, 2018 · 429640f · 429640f
1 parent 4366313
commit 429640f
Show file tree

Hide file tree

Showing 9 changed files with 117 additions and 30 deletions.
diff --git a/camelot/cli.py b/camelot/cli.py
@@ -27,6 +27,7 @@ def set_config(self, key, value):
 @click.version_option(version=__version__)
 @click.option('-p', '--pages', default='1', help='Comma-separated page numbers.'
  ' Example: 1,3,4 or 1,4-end.')
+@click.option('-pw', '--password', help='Password for decryption.')
 @click.option('-o', '--output', help='Output file path.')
 @click.option('-f', '--format',
  type=click.Choice(['csv', 'json', 'excel', 'html']),

diff --git a/camelot/handlers.py b/camelot/handlers.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 
 import os
+import sys
 
 from PyPDF2 import PdfFileReader, PdfFileWriter
 
@@ -21,14 +22,22 @@ class PDFHandler(object):
  Path to PDF file.
  pages : str, optional (default: '1')
  Comma-separated page numbers.
- Example: 1,3,4 or 1,4-end.
+ Example: '1,3,4' or '1,4-end'.
+ password : str, optional (default: None)
+ Password for decryption.
 
  """
- def __init__(self, filename, pages='1'):
+ def __init__(self, filename, pages='1', password=None):
  self.filename = filename
  if not filename.lower().endswith('.pdf'):
  raise NotImplementedError("File format not supported")
  self.pages = self._get_pages(self.filename, pages)
+ if password is None:
+ self.password = ''
+ else:
+ self.password = password
+ if sys.version_info[0] < 3:
+ self.password = self.password.encode('ascii')
 
  def _get_pages(self, filename, pages):
  """Converts pages string to list of ints.
@@ -52,6 +61,8 @@ def _get_pages(self, filename, pages):
  page_numbers.append({'start': 1, 'end': 1})
  else:
  infile = PdfFileReader(open(filename, 'rb'), strict=False)
+ if infile.isEncrypted:
+ infile.decrypt(self.password)
  if pages == 'all':
  page_numbers.append({'start': 1, 'end': infile.getNumPages()})
  else:
@@ -84,7 +95,7 @@ def _save_page(self, filename, page, temp):
  with open(filename, 'rb') as fileobj:
  infile = PdfFileReader(fileobj, strict=False)
  if infile.isEncrypted:
- infile.decrypt('')
+ infile.decrypt(self.password)
  fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
  froot, fext = os.path.splitext(fpath)
  p = infile.getPage(page - 1)
@@ -103,7 +114,7 @@ def _save_page(self, filename, page, temp):
  os.rename(fpath, fpath_new)
  infile = PdfFileReader(open(fpath_new, 'rb'), strict=False)
  if infile.isEncrypted:
- infile.decrypt('')
+ infile.decrypt(self.password)
  outfile = PdfFileWriter()
  p = infile.getPage(0)
  if rotation == 'anticlockwise':

diff --git a/camelot/io.py b/camelot/io.py
@@ -5,8 +5,8 @@
 from .utils import validate_input, remove_extra
 
 
-def read_pdf(filepath, pages='1', flavor='lattice', suppress_warnings=False,
- **kwargs):
+def read_pdf(filepath, pages='1', password=None, flavor='lattice',
+ suppress_warnings=False, **kwargs):
  """Read PDF and return extracted tables.
 
  Note: kwargs annotated with ^ can only be used with flavor='stream'
@@ -19,6 +19,8 @@ def read_pdf(filepath, pages='1', flavor='lattice', suppress_warnings=False,
  pages : str, optional (default: '1')
  Comma-separated page numbers.
  Example: '1,3,4' or '1,4-end'.
+ password : str, optional (default: None)
+ Password for decryption.
  flavor : str (default: 'lattice')
  The parsing method to use ('lattice' or 'stream').
  Lattice is used by default.
@@ -94,7 +96,7 @@ def read_pdf(filepath, pages='1', flavor='lattice', suppress_warnings=False,
  warnings.simplefilter("ignore")
 
  validate_input(kwargs, flavor=flavor)
- p = PDFHandler(filepath, pages)
+ p = PDFHandler(filepath, pages=pages, password=password)
  kwargs = remove_extra(kwargs, flavor=flavor)
  tables = p.parse(flavor=flavor, **kwargs)
  return tables
diff --git a/docs/user/cli.rst b/docs/user/cli.rst
@@ -9,26 +9,28 @@ You can print the help for the interface by typing ``camelot --help`` in your fa
 
 ::
 
- Usage: camelot [OPTIONS] COMMAND [ARGS]...
+Usage: camelot [OPTIONS] COMMAND [ARGS]...
 
  Camelot: PDF Table Extraction for Humans
 
- Options:
- --version Show the version and exit.
- -p, --pages TEXT Comma-separated page numbers. Example: 1,3,4
- or 1,4-end.
- -o, --output TEXT Output file path.
- -f, --format [csv|json|excel|html]
- Output file format.
- -z, --zip Create ZIP archive.
- -split, --split_text Split text that spans across multiple cells.
- -flag, --flag_size Flag text based on font size. Useful to
- detect super/subscripts.
- -M, --margins <FLOAT FLOAT FLOAT>...
- PDFMiner char_margin, line_margin and
- word_margin.
- --help Show this message and exit.
-
- Commands:
- lattice Use lines between text to parse the table.
- stream Use spaces between text to parse the table.
+Options:
+ --version Show the version and exit.
+ -p, --pages TEXT Comma-separated page numbers. Example: 1,3,4
+ or 1,4-end.
+ -pw, --password TEXT Password for decryption.
+ -o, --output TEXT Output file path.
+ -f, --format [csv|json|excel|html]
+ Output file format.
+ -z, --zip Create ZIP archive.
+ -split, --split_text Split text that spans across multiple cells.
+ -flag, --flag_size Flag text based on font size. Useful to
+ detect super/subscripts.
+ -M, --margins <FLOAT FLOAT FLOAT>...
+ PDFMiner char_margin, line_margin and
+ word_margin.
+ -q, --quiet Suppress warnings.
+ --help Show this message and exit.
+
+Commands:
+ lattice Use lines between text to parse the table.
+ stream Use spaces between text to parse the table.
diff --git a/docs/user/quickstart.rst b/docs/user/quickstart.rst
@@ -87,6 +87,28 @@ By default, Camelot only uses the first page of the PDF to extract tables. To sp
 
 The ``pages`` keyword argument accepts pages as a comma-separated string of page numbers. You can also specify page ranges — for example, ``pages='1,4-10,20-30'`` or ``pages='1,4-10,20-end'``.
 
-------------------------
+Reading encrypted PDFs
+----------------------
 
-Ready for more? Check out the :ref:`advanced <advanced>` section.
+To extract tables from encrypted PDF files you must provide a password when calling :meth:`read_pdf() <camelot.read_pdf>`.
+
+::
+
+ >>> tables = camelot.read_pdf('foo.pdf', password='userpass')
+ >>> tables
+ <TableList n=1>
+
+Currently Camelot only supports PDFs encrypted with ASCII passwords and algorithm `code 1 or 2`_. An exception is thrown if the PDF cannot be read. This may be due to no password being provided, an incorrect password, or an unsupported encryption algorithm.
+
+Further encryption support may be added in the future. In the meantime, if your PDF files use an unsupported encryption algorithm, you are advised to remove the encryption before calling :meth:`read_pdf() <camelot.read_pdf>`. This can be achieved with third-party tools such as `QPDF`_.
+
+::
+
+ $ qpdf --password=<PASSWORD> --decrypt input.pdf output.pdf
+
+.. _code 1 or 2: https://github.com/mstamy2/PyPDF2/issues/378
+.. _QPDF: https://www.github.com/qpdf/qpdf
+
+----
+
+Ready for more? Check out the :ref:`advanced <advanced>` section.
diff --git a/tests/files/health_protected.pdf b/tests/files/health_protected.pdf
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -52,6 +52,30 @@ def test_cli_stream():
  assert format_error in result.output
 
 
+def test_cli_password():
+ with TemporaryDirectory() as tempdir:
+ infile = os.path.join(testdir, 'health_protected.pdf')
+ outfile = os.path.join(tempdir, 'health_protected.csv')
+ runner = CliRunner()
+ result = runner.invoke(cli, ['--password', 'userpass',
+ '--format', 'csv', '--output', outfile,
+ 'stream', infile])
+ assert result.exit_code == 0
+ assert result.output == 'Found 1 tables\n'
+
+ output_error = 'file has not been decrypted'
+ # no password
+ result = runner.invoke(cli, ['--format', 'csv', '--output', outfile,
+ 'stream', infile])
+ assert output_error in str(result.exception)
+
+ # bad password
+ result = runner.invoke(cli, ['--password', 'wrongpass',
+ '--format', 'csv', '--output', outfile,
+ 'stream', infile])
+ assert output_error in str(result.exception)
+
+
 def test_cli_output_format():
  with TemporaryDirectory() as tempdir:
  infile = os.path.join(testdir, 'health.pdf')
@@ -78,7 +102,7 @@ def test_cli_output_format():
  'stream', infile])
  assert result.exit_code == 0
 
-def test_cli_quiet_flag():
+def test_cli_quiet():
  with TemporaryDirectory() as tempdir:
  infile = os.path.join(testdir, 'blank.pdf')
  outfile = os.path.join(tempdir, 'blank.csv')

diff --git a/tests/test_common.py b/tests/test_common.py
@@ -25,6 +25,17 @@ def test_parsing_report():
  assert tables[0].parsing_report == parsing_report
 
 
+def test_password():
+ df = pd.DataFrame(data_stream)
+
+ filename = os.path.join(testdir, "health_protected.pdf")
+ tables = camelot.read_pdf(filename, password="ownerpass", flavor="stream")
+ assert df.equals(tables[0].df)
+
+ tables = camelot.read_pdf(filename, password="userpass", flavor="stream")
+ assert df.equals(tables[0].df)
+
+
 def test_stream():
  df = pd.DataFrame(data_stream)
 

diff --git a/tests/test_errors.py b/tests/test_errors.py
@@ -75,3 +75,17 @@ def _find_executable_patch(arg):
  filename = os.path.join(testdir, 'foo.pdf')
  with pytest.raises(Exception, message=message):
  tables = camelot.read_pdf(filename)
+
+
+def test_no_password():
+ filename = os.path.join(testdir, 'health_protected.pdf')
+ message = 'file has not been decrypted'
+ with pytest.raises(Exception, message=message):
+ tables = camelot.read_pdf(filename)
+
+
+def test_bad_password():
+ filename = os.path.join(testdir, 'health_protected.pdf')
+ message = 'file has not been decrypted'
+ with pytest.raises(Exception, message=message):
+ tables = camelot.read_pdf(filename, password='wrongpass')