Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Add read support for Google Cloud Storage #20729

Merged
merged 3 commits into from
Jun 26, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ci/appveyor-27.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ dependencies:
- beautifulsoup4
- bottleneck
- dateutil
- gcsfs
- html5lib
- jinja2=2.8
- lxml
Expand Down
1 change: 1 addition & 0 deletions ci/check_imports.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

blacklist = {
'bs4',
'gcsfs',
'html5lib',
'ipython',
'jinja2'
Expand Down
1 change: 1 addition & 0 deletions ci/circle-36-locale_slow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ channels:
dependencies:
- beautifulsoup4
- cython
- gcsfs
- html5lib
- ipython
- jinja2
Expand Down
1 change: 1 addition & 0 deletions ci/requirements-optional-conda.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ blosc
bottleneck
fastparquet
feather-format
gcsfs
html5lib
ipython>=5.6.0
ipykernel
Expand Down
1 change: 1 addition & 0 deletions ci/requirements-optional-pip.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ blosc
bottleneck
fastparquet
feather-format
gcsfs
html5lib
ipython>=5.6.0
ipykernel
Expand Down
1 change: 1 addition & 0 deletions ci/travis-27.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ dependencies:
- fastparquet
- feather-format
- flake8=3.4.1
- gcsfs
- html5lib
- ipython
- jemalloc=4.5.0.post
Expand Down
1 change: 1 addition & 0 deletions ci/travis-36.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ dependencies:
- dask
- fastparquet
- feather-format
- gcsfs
- geopandas
- html5lib
- ipython
Expand Down
1 change: 1 addition & 0 deletions doc/source/install.rst
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,7 @@ Optional Dependencies
* `Jinja2 <http://jinja.pocoo.org/>`__: Template engine for conditional HTML formatting.
* `s3fs <http://s3fs.readthedocs.io/>`__: necessary for Amazon S3 access (s3fs >= 0.0.7).
* `blosc <https://pypi.org/project/blosc>`__: for msgpack compression using ``blosc``
* `gcsfs <http://gcsfs.readthedocs.io/>`__: necessary for Google Cloud Storage access (gcsfs >= 0.1.0).
* One of
`qtpy <https://github.com/spyder-ide/qtpy>`__ (requires PyQt or PySide),
`PyQt5 <https://www.riverbankcomputing.com/software/pyqt/download5>`__,
Expand Down
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ Other Enhancements
- :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether NaN/NaT values should be considered (:issue:`17534`)
- :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. (:issue:`21227`)
- :meth:`Index.droplevel` is now implemented also for flat indexes, for compatibility with :class:`MultiIndex` (:issue:`21115`)

- Added support for reading from Google Cloud Storage via the ``gcsfs`` library (:issue:`19454`)

.. _whatsnew_0240.api_breaking:

Expand Down
16 changes: 16 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import importlib

import pytest

import numpy as np
Expand Down Expand Up @@ -249,3 +251,17 @@ def any_int_dtype(request):
"""

return request.param


@pytest.fixture
def mock():
    """
    Hand the ``mock`` module to a test as a fixture.

    On Python 3 the standard library's ``unittest.mock`` is returned. On
    Python 2 the third-party ``mock`` package is imported instead; when it
    is not installed the requesting test is skipped.
    """
    if not PY3:
        # importorskip skips the requesting test if 'mock' cannot be imported.
        return pytest.importorskip("mock")
    return importlib.import_module("unittest.mock")
19 changes: 17 additions & 2 deletions pandas/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def _is_url(url):
"""
try:
return parse_url(url).scheme in _VALID_URLS
except:
except Exception:
return False


Expand Down Expand Up @@ -165,7 +165,15 @@ def is_s3_url(url):
"""Check for an s3, s3n, or s3a url"""
try:
return parse_url(url).scheme in ['s3', 's3n', 's3a']
except: # noqa
except Exception:
return False


def is_gcs_url(url):
    """
    Report whether ``url`` uses a Google Cloud Storage scheme.

    Returns True when the parsed scheme is ``gcs`` or ``gs``. Returns False
    for any other scheme, and also when parsing ``url`` raises.
    """
    try:
        scheme = parse_url(url).scheme
        # Kept inside the try so that any failure still yields False.
        return scheme in ('gcs', 'gs')
    except Exception:
        return False


Expand Down Expand Up @@ -208,6 +216,13 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
compression=compression,
mode=mode)

if is_gcs_url(filepath_or_buffer):
from pandas.io import gcs
return gcs.get_filepath_or_buffer(filepath_or_buffer,
encoding=encoding,
compression=compression,
mode=mode)

if isinstance(filepath_or_buffer, (compat.string_types,
compat.binary_type,
mmap.mmap)):
Expand Down
2 changes: 1 addition & 1 deletion pandas/io/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
io : string, path object (pathlib.Path or py._path.local.LocalPath),
file-like object, pandas ExcelFile, or xlrd workbook.
The string could be a URL. Valid URL schemes include http, ftp, s3,
and file. For file URLs, a host is expected. For instance, a local
gcs, and file. For file URLs, a host is expected. For instance, a local
file could be file://localhost/path/to/workbook.xlsx
sheet_name : string, int, mixed list of strings/ints, or None, default 0

Expand Down
16 changes: 16 additions & 0 deletions pandas/io/gcs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
""" GCS support for remote file interactivity """
try:
import gcsfs
except ImportError:
raise ImportError("The gcsfs library is required to handle GCS files")


def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
compression=None, mode=None):

if mode is None:
mode = 'rb'

fs = gcsfs.GCSFileSystem()
filepath_or_buffer = fs.open(filepath_or_buffer, mode)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@bnaul do you think this is an appropriate place to mock? fs.open could return a BytesIO object?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a simple mock-based test here; any other methods I should include besides read_csv?

return filepath_or_buffer, None, compression, True
6 changes: 3 additions & 3 deletions pandas/io/json/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,9 +231,9 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
Parameters
----------
path_or_buf : a valid JSON string or file-like, default: None
The string could be a URL. Valid URL schemes include http, ftp, s3, and
file. For file URLs, a host is expected. For instance, a local file
could be ``file://localhost/path/to/table.json``
The string could be a URL. Valid URL schemes include http, ftp, s3,
gcs, and file. For file URLs, a host is expected. For instance, a local
file could be ``file://localhost/path/to/table.json``
orient : string,
Indication of expected JSON string format.
Expand Down
9 changes: 3 additions & 6 deletions pandas/tests/dtypes/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
DatetimeIndex, TimedeltaIndex, Timestamp,
Panel, Period, Categorical, isna, Interval,
DateOffset)
from pandas.compat import u, PY2, PY3, StringIO, lrange
from pandas.compat import u, PY2, StringIO, lrange
from pandas.core.dtypes import inference
from pandas.core.dtypes.common import (
is_timedelta64_dtype,
Expand Down Expand Up @@ -128,7 +128,7 @@ def test_is_dict_like_fails(ll):
assert not inference.is_dict_like(ll)


def test_is_file_like():
def test_is_file_like(mock):
class MockFile(object):
pass

Expand Down Expand Up @@ -166,10 +166,7 @@ class MockFile(object):
# Iterator but no read / write attributes
data = [1, 2, 3]
assert not is_file(data)

if PY3:
from unittest import mock
assert not is_file(mock.Mock())
assert not is_file(mock.Mock())


@pytest.mark.parametrize(
Expand Down
9 changes: 3 additions & 6 deletions pandas/tests/io/parser/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1546,7 +1546,7 @@ def test_file_handles(self):
assert not m.closed
m.close()

def test_invalid_file_buffer(self):
def test_invalid_file_buffer(self, mock):
# see gh-15337

class InvalidBuffer(object):
Expand Down Expand Up @@ -1577,11 +1577,8 @@ def seek(self, pos, whence=0):

tm.assert_frame_equal(result, expected)

if PY3:
from unittest import mock

with tm.assert_raises_regex(ValueError, msg):
self.read_csv(mock.Mock())
with tm.assert_raises_regex(ValueError, msg):
self.read_csv(mock.Mock())

@tm.capture_stderr
def test_skip_bad_lines(self):
Expand Down
47 changes: 47 additions & 0 deletions pandas/tests/io/test_gcs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import numpy as np
import pytest

from pandas import DataFrame, date_range, read_csv
from pandas.compat import StringIO
from pandas.io.common import is_gcs_url
from pandas.util import _test_decorators as td
from pandas.util.testing import assert_frame_equal


def test_is_gcs_url():
    """The 'gcs' and 'gs' schemes count as GCS URLs; 's3' does not."""
    gcs_urls = ("gcs://pandas/somethingelse.com",
                "gs://pandas/somethingelse.com")
    for url in gcs_urls:
        assert is_gcs_url(url)

    assert not is_gcs_url("s3://pandas/somethingelse.com")


@td.skip_if_no('gcsfs')
def test_read_csv_gcs(mock):
    """
    read_csv on a ``gs://`` URL parses whatever the GCS filesystem opens.

    ``gcsfs.GCSFileSystem`` is patched so that ``open`` on the instance
    returns an in-memory CSV buffer; no real storage is contacted.
    """
    expected = DataFrame({'int': [1, 3], 'float': [2.0, np.nan],
                          'str': ['t', 's'],
                          'dt': date_range('2018-06-18', periods=2)})
    csv_buffer = StringIO(expected.to_csv(index=False))

    with mock.patch('gcsfs.GCSFileSystem') as MockFileSystem:
        # The patched class's instance serves the prepared CSV buffer.
        MockFileSystem.return_value.open.return_value = csv_buffer
        result = read_csv('gs://test/test.csv', parse_dates=['dt'])

    assert_frame_equal(expected, result)


@td.skip_if_no('gcsfs')
def test_gcs_get_filepath_or_buffer(mock):
    """
    read_csv routes ``gs://`` URLs through pandas.io.gcs.

    ``pandas.io.gcs.get_filepath_or_buffer`` is patched to return a prepared
    CSV buffer; the test checks both the parsed frame and that the patched
    function was called.
    """
    expected = DataFrame({'int': [1, 3], 'float': [2.0, np.nan],
                          'str': ['t', 's'],
                          'dt': date_range('2018-06-18', periods=2)})
    csv_buffer = StringIO(expected.to_csv(index=False))
    target = 'pandas.io.gcs.get_filepath_or_buffer'

    with mock.patch(target) as MockGetFilepath:
        # Same 4-tuple shape as in the original test: buffer, None, None, False.
        MockGetFilepath.return_value = (csv_buffer, None, None, False)
        result = read_csv('gs://test/test.csv', parse_dates=['dt'])

    assert_frame_equal(expected, result)
    assert MockGetFilepath.called


@pytest.mark.skipif(td.safe_import('gcsfs'),
                    reason='Only check when gcsfs not installed')
def test_gcs_not_present_exception():
    """
    Reading a ``gs://`` URL without gcsfs raises an informative ImportError.

    Skipped when ``td.safe_import('gcsfs')`` is truthy, i.e. the check only
    runs in environments where gcsfs is not installed.
    """
    with pytest.raises(ImportError) as e:
        read_csv('gs://test/test.csv')

    # Must sit outside the ``with`` block: statements that follow the raising
    # call inside the block never execute, so the message would go unchecked.
    assert 'gcsfs library is required' in str(e.value)
1 change: 1 addition & 0 deletions pandas/util/_print_versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ def show_versions(as_json=False):
("fastparquet", lambda mod: mod.__version__),
("pandas_gbq", lambda mod: mod.__version__),
("pandas_datareader", lambda mod: mod.__version__),
("gcsfs", lambda mod: mod.__version__),
]

deps_blob = list()
Expand Down