atlanhq · vinayak-mehta · Dec 24, 2018 · Dec 24, 2018 · Dec 24, 2018 · Dec 24, 2018
diff --git a/HISTORY.md b/HISTORY.md
@@ -6,6 +6,7 @@ master
 
 **Improvements**
 
+* [#91](https://github.com/socialcopsdev/camelot/issues/91) Add support for reading PDFs from a URL. [#236](https://github.com/socialcopsdev/camelot/pull/236) by Vinayak Mehta.
 * [#229](https://github.com/socialcopsdev/camelot/issues/229), [#230](https://github.com/socialcopsdev/camelot/issues/230) and [#233](https://github.com/socialcopsdev/camelot/issues/233) New configuration parameters. [#234](https://github.com/socialcopsdev/camelot/pull/234) by Vinayak Mehta.
     * `strip_text`: To define characters that should be stripped from each string.
     * `edge_tol`: Tolerance parameter for extending textedges vertically.

diff --git a/camelot/handlers.py b/camelot/handlers.py
@@ -8,7 +8,7 @@
 from .core import TableList
 from .parsers import Stream, Lattice
 from .utils import (TemporaryDirectory, get_page_layout, get_text_objects,
-                    get_rotation)
+                    get_rotation, is_url, download_url)
 
 
 class PDFHandler(object):
@@ -18,34 +18,36 @@ class PDFHandler(object):
 
     Parameters
     ----------
-    filename : str
-        Path to PDF file.
+    filepath : str
+        Filepath or URL of the PDF file.
     pages : str, optional (default: '1')
         Comma-separated page numbers.
         Example: '1,3,4' or '1,4-end'.
     password : str, optional (default: None)
         Password for decryption.
 
     """
-    def __init__(self, filename, pages='1', password=None):
-        self.filename = filename
-        if not filename.lower().endswith('.pdf'):
+    def __init__(self, filepath, pages='1', password=None):
+        if is_url(filepath):
+            filepath = download_url(filepath)
+        self.filepath = filepath
+        if not filepath.lower().endswith('.pdf'):
             raise NotImplementedError("File format not supported")
-        self.pages = self._get_pages(self.filename, pages)
+        self.pages = self._get_pages(self.filepath, pages)
         if password is None:
             self.password = ''
         else:
             self.password = password
             if sys.version_info[0] < 3:
                 self.password = self.password.encode('ascii')
 
-    def _get_pages(self, filename, pages):
+    def _get_pages(self, filepath, pages):
         """Converts pages string to list of ints.
 
         Parameters
         ----------
-        filename : str
-            Path to PDF file.
+        filepath : str
+            Local filepath of the PDF file (a URL is downloaded in __init__).
         pages : str, optional (default: '1')
             Comma-separated page numbers.
             Example: 1,3,4 or 1,4-end.
@@ -60,7 +62,7 @@ def _get_pages(self, filename, pages):
         if pages == '1':
             page_numbers.append({'start': 1, 'end': 1})
         else:
-            infile = PdfFileReader(open(filename, 'rb'), strict=False)
+            infile = PdfFileReader(open(filepath, 'rb'), strict=False)
             if infile.isEncrypted:
                 infile.decrypt(self.password)
             if pages == 'all':
@@ -79,20 +81,20 @@ def _get_pages(self, filename, pages):
             P.extend(range(p['start'], p['end'] + 1))
         return sorted(set(P))
 
-    def _save_page(self, filename, page, temp):
+    def _save_page(self, filepath, page, temp):
         """Saves specified page from PDF into a temporary directory.
 
         Parameters
         ----------
-        filename : str
-            Path to PDF file.
+        filepath : str
+            Local filepath of the PDF file (a URL is downloaded in __init__).
         page : int
             Page number.
         temp : str
             Tmp directory.
 
         """
-        with open(filename, 'rb') as fileobj:
+        with open(filepath, 'rb') as fileobj:
             infile = PdfFileReader(fileobj, strict=False)
             if infile.isEncrypted:
                 infile.decrypt(self.password)
@@ -150,7 +152,7 @@ def parse(self, flavor='lattice', suppress_stdout=False, layout_kwargs={}, **kwa
         tables = []
         with TemporaryDirectory() as tempdir:
             for p in self.pages:
-                self._save_page(self.filename, p, tempdir)
+                self._save_page(self.filepath, p, tempdir)
             pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p))
                      for p in self.pages]
             parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)

diff --git a/camelot/io.py b/camelot/io.py
@@ -15,7 +15,7 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
     Parameters
     ----------
     filepath : str
-        Path to PDF file.
+        Filepath or URL of the PDF file.
     pages : str, optional (default: '1')
         Comma-separated page numbers.
         Example: '1,3,4' or '1,4-end'.

diff --git a/camelot/utils.py b/camelot/utils.py
@@ -1,12 +1,17 @@
+# -*- coding: utf-8 -*-
 from __future__ import division
+
+import os
+import sys
+import random
 import shutil
+import string
 import tempfile
 import warnings
 from itertools import groupby
 from operator import itemgetter
 
 import numpy as np
-
 from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfdocument import PDFDocument
 from pdfminer.pdfpage import PDFPage
@@ -18,6 +23,77 @@
                              LTTextLineVertical)
 
 
+PY3 = sys.version_info[0] >= 3
+if PY3:
+    from urllib.request import urlopen
+    from urllib.parse import urlparse as parse_url
+    from urllib.parse import uses_relative, uses_netloc, uses_params
+else:
+    from urllib2 import urlopen
+    from urlparse import urlparse as parse_url
+    from urlparse import uses_relative, uses_netloc, uses_params
+
+
+# Union of the three imported scheme lists. The empty-string entry is
+# discarded so that a string without a scheme (presumably a plain local
+# path) is never matched by is_url().
+_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
+_VALID_URLS.discard('')
+
+
+# Adapted from https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
+def is_url(url):
+    """Tell whether a string carries a recognised URL scheme.
+
+    Parameters
+    ----------
+    url : str or unicode
+        Candidate string; anything that cannot be parsed counts as
+        "not a URL".
+
+    Returns
+    -------
+    isurl : bool
+        True when the parsed scheme is one of ``_VALID_URLS``,
+        False otherwise (including when parsing raises).
+
+    """
+    try:
+        scheme = parse_url(url).scheme
+        recognised = scheme in _VALID_URLS
+    except Exception:
+        # Deliberately broad: any input that cannot be parsed is simply
+        # reported as not being a URL.
+        recognised = False
+    return recognised
+
+
+def random_string(length):
+    """Generate a random alphanumeric string.
+
+    Parameters
+    ----------
+    length : int
+        Number of characters to generate. Zero or a negative value
+        yields an empty string.
+
+    Returns
+    -------
+    ret : str
+        ``length`` characters drawn from ASCII digits and letters.
+
+    """
+    alphabet = string.digits + string.ascii_lowercase + string.ascii_uppercase
+    # range() is empty for non-positive lengths, so a negative length
+    # returns '' instead of counting down forever.
+    return ''.join(random.choice(alphabet) for _ in range(length))
+
+
+def download_url(url):
+    """Download a PDF from the specified URL into a temporary file.
+
+    Parameters
+    ----------
+    url : str or unicode
+
+    Returns
+    -------
+    filepath : str or unicode
+        Path of a temporary file ending in '.pdf'. The file is not
+        deleted automatically; the caller owns it.
+
+    Raises
+    ------
+    NotImplementedError
+        If the response's content type is not 'application/pdf'.
+
+    """
+    obj = urlopen(url)
+    try:
+        if PY3:
+            content_type = obj.info().get_content_type()
+        else:
+            # The raw header may be missing or carry parameters such as
+            # "; charset=...", so reduce it to the bare media type.
+            header = obj.info().getheader('Content-Type') or ''
+            content_type = header.split(';')[0].strip().lower()
+        # Validate before creating the file so a rejected response
+        # leaves nothing behind on disk.
+        if content_type != 'application/pdf':
+            raise NotImplementedError("File format not supported")
+        # suffix='.pdf' gives a unique, collision-free name that still
+        # satisfies the '.pdf' extension check done by the caller.
+        f = tempfile.NamedTemporaryFile('wb', suffix='.pdf', delete=False)
+        try:
+            with f:
+                shutil.copyfileobj(obj, f)
+        except Exception:
+            # Do not leak a partially written file if the download fails.
+            os.remove(f.name)
+            raise
+        return f.name
+    finally:
+        obj.close()
+
+
 stream_kwargs = [
     'columns',
     'row_tol',

diff --git a/tests/test_common.py b/tests/test_common.py
@@ -207,6 +207,14 @@ def test_repr():
     assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
 
 
+def test_url():
+    """Reading a PDF by URL yields the same table as the local foo.pdf."""
+    pdf_url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
+    result = camelot.read_pdf(pdf_url)
+    assert repr(result) == "<TableList n=1>"
+    assert repr(result[0]) == "<Table shape=(7, 7)>"
+    assert repr(result[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+
+
 def test_arabic():
     df = pd.DataFrame(data_arabic)