Skip to content

Commit

Permalink
Google Cloud Storage support using gcsfs
Browse files Browse the repository at this point in the history
  • Loading branch information
bnaul committed May 22, 2018
1 parent 172ab7a commit b11b2f8
Show file tree
Hide file tree
Showing 9 changed files with 67 additions and 7 deletions.
1 change: 1 addition & 0 deletions ci/check_imports.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

blacklist = {
'bs4',
'gcsfs',
'html5lib',
'ipython',
'jinja2'
Expand Down
2 changes: 1 addition & 1 deletion ci/requirements-optional-pip.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,4 @@ sqlalchemy
xarray
xlrd
xlsxwriter
xlwt
xlwt
1 change: 1 addition & 0 deletions doc/source/install.rst
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,7 @@ Optional Dependencies
* `Jinja2 <http://jinja.pocoo.org/>`__: Template engine for conditional HTML formatting.
* `s3fs <http://s3fs.readthedocs.io/>`__: necessary for Amazon S3 access (s3fs >= 0.0.7).
* `blosc <https://pypi.org/project/blosc>`__: for msgpack compression using ``blosc``
* `gcsfs <http://gcsfs.readthedocs.io/>`__: necessary for Google Cloud Storage access (gcsfs >= 0.1.0).
* One of
`qtpy <https://github.com/spyder-ide/qtpy>`__ (requires PyQt or PySide),
`PyQt5 <https://www.riverbankcomputing.com/software/pyqt/download5>`__,
Expand Down
19 changes: 17 additions & 2 deletions pandas/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def _is_url(url):
"""
try:
return parse_url(url).scheme in _VALID_URLS
except:
except Exception:
return False


Expand Down Expand Up @@ -165,7 +165,15 @@ def is_s3_url(url):
"""Check for an s3, s3n, or s3a url"""
try:
return parse_url(url).scheme in ['s3', 's3n', 's3a']
except: # noqa
except Exception:
return False


def is_gcs_url(url):
    """Report whether ``url`` uses a Google Cloud Storage scheme.

    Parameters
    ----------
    url : object
        Candidate URL; it is handed to ``parse_url`` as-is.

    Returns
    -------
    bool
        True when the parsed scheme is ``gcs`` or ``gs``. False otherwise,
        including when parsing or comparing raises any ``Exception``.
    """
    gcs_schemes = ('gcs', 'gs')
    try:
        # Keep the membership test inside the ``try`` so that any failure,
        # not only a parsing failure, is reported as "not a GCS URL".
        scheme = parse_url(url).scheme
        return scheme in gcs_schemes
    except Exception:
        return False


Expand Down Expand Up @@ -208,6 +216,13 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
compression=compression,
mode=mode)

if is_gcs_url(filepath_or_buffer):
from pandas.io import gcs
return gcs.get_filepath_or_buffer(filepath_or_buffer,
encoding=encoding,
compression=compression,
mode=mode)

if isinstance(filepath_or_buffer, (compat.string_types,
compat.binary_type,
mmap.mmap)):
Expand Down
2 changes: 1 addition & 1 deletion pandas/io/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
io : string, path object (pathlib.Path or py._path.local.LocalPath),
file-like object, pandas ExcelFile, or xlrd workbook.
The string could be a URL. Valid URL schemes include http, ftp, s3,
and file. For file URLs, a host is expected. For instance, a local
gcs, and file. For file URLs, a host is expected. For instance, a local
file could be file://localhost/path/to/workbook.xlsx
sheet_name : string, int, mixed list of strings/ints, or None, default 0
Expand Down
16 changes: 16 additions & 0 deletions pandas/io/gcs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
""" GCS support for remote file interactivity """
try:
import gcsfs
except ImportError:
raise ImportError("The gcsfs library is required to handle GCS files")


def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None, mode=None):
    """Open a GCS path through ``gcsfs`` and return it as a file handle.

    Parameters
    ----------
    filepath_or_buffer : str
        Location passed straight to ``GCSFileSystem.open``.
    encoding : optional
        Accepted for signature compatibility; not used by this function.
    compression : optional
        Returned unchanged as the third element of the result.
    mode : str, optional
        Mode handed to ``GCSFileSystem.open``; ``'rb'`` when None.

    Returns
    -------
    tuple
        ``(opened_file, None, compression, True)``.
        NOTE(review): presumably this mirrors the
        (buffer, encoding, compression, should_close) contract of the
        generic helper in ``pandas.io.common`` -- confirm against the caller.
    """
    open_mode = 'rb' if mode is None else mode

    # A fresh filesystem object is created with default arguments per call.
    gcs_handle = gcsfs.GCSFileSystem().open(filepath_or_buffer, open_mode)
    return gcs_handle, None, compression, True
6 changes: 3 additions & 3 deletions pandas/io/json/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,9 +231,9 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
Parameters
----------
path_or_buf : a valid JSON string or file-like, default: None
The string could be a URL. Valid URL schemes include http, ftp, s3, and
file. For file URLs, a host is expected. For instance, a local file
could be ``file://localhost/path/to/table.json``
The string could be a URL. Valid URL schemes include http, ftp, s3,
gcs, and file. For file URLs, a host is expected. For instance, a local
file could be ``file://localhost/path/to/table.json``
orient : string,
Indication of expected JSON string format.
Expand Down
26 changes: 26 additions & 0 deletions pandas/tests/io/test_gcs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from pandas import DataFrame, read_csv
from pandas.compat import BytesIO
from pandas.io.common import is_gcs_url


class TestGCSURL(object):
    """Tests for GCS URL detection and reading CSV data from a GCS URL."""

    def test_is_gcs_url(self):
        """Both ``gcs://`` and ``gs://`` are GCS URLs; ``s3://`` is not."""
        assert is_gcs_url("gcs://pandas/somethingelse.com")
        assert is_gcs_url("gs://pandas/somethingelse.com")
        assert not is_gcs_url("s3://pandas/somethingelse.com")

    def test_read_csv_gcs(self):
        """``read_csv`` on a ``gs://`` URL parses the bytes gcsfs returns."""
        # ``mock`` is in the standard library on Python 3 only; fall back to
        # the external backport otherwise.
        try:
            from unittest.mock import patch
        except ImportError:
            from mock import patch

        # Replace the real filesystem so no network access or credentials
        # are needed; ``open`` hands back an in-memory CSV payload.
        with patch('gcsfs.GCSFileSystem') as MockFileSystem:
            instance = MockFileSystem.return_value
            instance.open.return_value = BytesIO(b'a,b\n1,2\n3,4')
            df = read_csv('gs://test/test.csv')

        assert isinstance(df, DataFrame)
        # Compare the row count itself. ``len(df == 2)`` would measure the
        # boolean comparison frame and pass for any non-empty result.
        assert len(df) == 2
        assert list(df.columns) == ['a', 'b']
1 change: 1 addition & 0 deletions pandas/util/_print_versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ def show_versions(as_json=False):
("fastparquet", lambda mod: mod.__version__),
("pandas_gbq", lambda mod: mod.__version__),
("pandas_datareader", lambda mod: mod.__version__),
("gcsfs", lambda mod: mod.__version__),
]

deps_blob = list()
Expand Down

0 comments on commit b11b2f8

Please sign in to comment.