Skip to content
This repository has been archived by the owner on Jan 6, 2025. It is now read-only.

[MRG] Add support to read from url #236

Merged
merged 3 commits into from
Dec 24, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ master

**Improvements**

* [#91](https://github.com/socialcopsdev/camelot/issues/91) Add support for reading a PDF from a URL. [#236](https://github.com/socialcopsdev/camelot/pull/236) by Vinayak Mehta.
* [#229](https://github.com/socialcopsdev/camelot/issues/229), [#230](https://github.com/socialcopsdev/camelot/issues/230) and [#233](https://github.com/socialcopsdev/camelot/issues/233) New configuration parameters. [#234](https://github.com/socialcopsdev/camelot/pull/234) by Vinayak Mehta.
* `strip_text`: To define characters that should be stripped from each string.
* `edge_tol`: Tolerance parameter for extending textedges vertically.
Expand Down
34 changes: 18 additions & 16 deletions camelot/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from .core import TableList
from .parsers import Stream, Lattice
from .utils import (TemporaryDirectory, get_page_layout, get_text_objects,
get_rotation)
get_rotation, is_url, download_url)


class PDFHandler(object):
Expand All @@ -18,34 +18,36 @@ class PDFHandler(object):

Parameters
----------
filename : str
Path to PDF file.
filepath : str
Filepath or URL of the PDF file.
pages : str, optional (default: '1')
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end'.
password : str, optional (default: None)
Password for decryption.

"""
def __init__(self, filename, pages='1', password=None):
self.filename = filename
if not filename.lower().endswith('.pdf'):
def __init__(self, filepath, pages='1', password=None):
if is_url(filepath):
filepath = download_url(filepath)
self.filepath = filepath
if not filepath.lower().endswith('.pdf'):
raise NotImplementedError("File format not supported")
self.pages = self._get_pages(self.filename, pages)
self.pages = self._get_pages(self.filepath, pages)
if password is None:
self.password = ''
else:
self.password = password
if sys.version_info[0] < 3:
self.password = self.password.encode('ascii')

def _get_pages(self, filename, pages):
def _get_pages(self, filepath, pages):
"""Converts pages string to list of ints.

Parameters
----------
filename : str
Path to PDF file.
filepath : str
Filepath or URL of the PDF file.
pages : str, optional (default: '1')
Comma-separated page numbers.
Example: 1,3,4 or 1,4-end.
Expand All @@ -60,7 +62,7 @@ def _get_pages(self, filename, pages):
if pages == '1':
page_numbers.append({'start': 1, 'end': 1})
else:
infile = PdfFileReader(open(filename, 'rb'), strict=False)
infile = PdfFileReader(open(filepath, 'rb'), strict=False)
if infile.isEncrypted:
infile.decrypt(self.password)
if pages == 'all':
Expand All @@ -79,20 +81,20 @@ def _get_pages(self, filename, pages):
P.extend(range(p['start'], p['end'] + 1))
return sorted(set(P))

def _save_page(self, filename, page, temp):
def _save_page(self, filepath, page, temp):
"""Saves specified page from PDF into a temporary directory.

Parameters
----------
filename : str
Path to PDF file.
filepath : str
Filepath or URL of the PDF file.
page : int
Page number.
temp : str
Tmp directory.

"""
with open(filename, 'rb') as fileobj:
with open(filepath, 'rb') as fileobj:
infile = PdfFileReader(fileobj, strict=False)
if infile.isEncrypted:
infile.decrypt(self.password)
Expand Down Expand Up @@ -150,7 +152,7 @@ def parse(self, flavor='lattice', suppress_stdout=False, layout_kwargs={}, **kwa
tables = []
with TemporaryDirectory() as tempdir:
for p in self.pages:
self._save_page(self.filename, p, tempdir)
self._save_page(self.filepath, p, tempdir)
pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p))
for p in self.pages]
parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)
Expand Down
2 changes: 1 addition & 1 deletion camelot/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
Parameters
----------
filepath : str
Path to PDF file.
Filepath or URL of the PDF file.
pages : str, optional (default: '1')
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end'.
Expand Down
78 changes: 77 additions & 1 deletion camelot/utils.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
# -*- coding: utf-8 -*-
from __future__ import division

import os
import sys
import random
import shutil
import string
import tempfile
import warnings
from itertools import groupby
from operator import itemgetter

import numpy as np

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
Expand All @@ -18,6 +23,77 @@
LTTextLineVertical)


PY3 = sys.version_info[0] >= 3
if PY3:
from urllib.request import urlopen
from urllib.parse import urlparse as parse_url
from urllib.parse import uses_relative, uses_netloc, uses_params
else:
from urllib2 import urlopen
from urlparse import urlparse as parse_url
from urlparse import uses_relative, uses_netloc, uses_params


# Every scheme listed by the URL-parsing module, minus the empty string
# that those lists use for "no scheme".
_VALID_URLS = set(uses_relative + uses_netloc + uses_params) - {''}


# https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
def is_url(url):
    """Tell whether a string carries a recognised URL scheme.

    Parameters
    ----------
    url : str or unicode

    Returns
    -------
    isurl : bool
        True when the scheme parsed from ``url`` is one of
        ``_VALID_URLS``; False otherwise, including when ``url``
        cannot be parsed at all.

    """
    try:
        scheme = parse_url(url).scheme
        return scheme in _VALID_URLS
    except Exception:
        # Anything that cannot be parsed is treated as "not a URL".
        return False


def random_string(length):
    """Return a random string of ASCII digits and letters.

    Parameters
    ----------
    length : int
        Number of characters to generate. Zero or a negative value
        yields an empty string.

    Returns
    -------
    ret : str
        ``length`` characters drawn from ``string.digits``,
        ``string.ascii_lowercase`` and ``string.ascii_uppercase``.

    """
    alphabet = string.digits + string.ascii_lowercase + string.ascii_uppercase
    # range() is simply empty for a negative length, whereas counting
    # down until the value is falsy would never terminate.
    return ''.join(random.choice(alphabet) for _ in range(length))


def download_url(url):
    """Download the PDF served at a URL into a temporary file.

    Parameters
    ----------
    url : str or unicode

    Returns
    -------
    filepath : str or unicode
        Path of the temporary ``.pdf`` file holding the downloaded
        content. The file is not deleted automatically.

    Raises
    ------
    NotImplementedError
        If the response's content type is not ``application/pdf``.

    """
    obj = urlopen(url)
    try:
        if PY3:
            content_type = obj.info().get_content_type()
        else:
            # The raw header may be missing or carry parameters
            # (e.g. "application/pdf; charset=binary"); keep the bare type.
            header = obj.info().getheader('Content-Type') or ''
            content_type = header.split(';')[0].strip().lower()
        # Validate before creating the temporary file so that a rejected
        # download leaves nothing behind on disk.
        if content_type != 'application/pdf':
            raise NotImplementedError("File format not supported")
        # Let tempfile pick a unique name with the required extension
        # instead of renaming to a short random name that could collide.
        f = tempfile.NamedTemporaryFile('wb', suffix='.pdf', delete=False)
        try:
            with f:
                # Stream in chunks rather than loading the whole body.
                shutil.copyfileobj(obj, f)
        except Exception:
            # Do not leave a partial download behind.
            os.remove(f.name)
            raise
        return f.name
    finally:
        # Explicit close works for both the Python 2 and Python 3 response
        # objects and releases the connection on every exit path.
        obj.close()


stream_kwargs = [
'columns',
'row_tol',
Expand Down
8 changes: 8 additions & 0 deletions tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,14 @@ def test_repr():
assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"


def test_url():
    """Check that ``read_pdf`` accepts an https URL in place of a file path."""
    tables = camelot.read_pdf(
        "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf")
    assert repr(tables) == "<TableList n=1>"
    first_table = tables[0]
    assert repr(first_table) == "<Table shape=(7, 7)>"
    top_left_cell = first_table.cells[0][0]
    assert repr(top_left_cell) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"


def test_arabic():
df = pd.DataFrame(data_arabic)

Expand Down