Google Cloud Storage (GCS) (#404)
* GCS support working

* Start updating setup.py and README

* Add .idea/ to .gitignore

* Move integration tests to their proper location

* All unit tests passing with mocks

* Add gcs integration tests

* Add .env to .gitignore

* Integration tests passing

* Remove kms integration test

* Fix failing test

* Change writer from buffer size to min part size

* Fix typo

* Change buffer size to min part size

* Fix another missed min_part_size

* Fix broken example

* Change smart_open to open

* Make WHENCE_CHOICES a tuple

* Specify client type

* Make init one line instead of multiple

* Shorten _parse_uri_gcs

* Use open instead of smart_open

* Fix memory issue with RawReader

* Fix type hints

* Fix exists in test

* Fix typo in smart_open_lib about schemes

* Remove RawReaders and BufferedInputBase

* Remove unneeded arg passing in tests

* Fix bug with seek

* Make constants internal and add docstrings

* Make functions internal and add docstrings

* Fix double-upload issue and add __str__ and __repr__

* Minor cleanup

* Fix flake8 errors

* Additional flake8 resolution

* Add source code encoding to test file

* Docstrings in imperative mood

* Fix grammar in tests

* Fix mock_gcs docstring

* Only support gs scheme

* Remove additional occurrences of removed gcs scheme

* Add test_read_past_end

* Clean up tests with class level decorator

* Remove stub function

* Use equality instead of in for scheme

* Fix repr and mock import

* Specify ImportError

* Use BytesIO in integration test

* Explicit encoding in integration tests

* Remove unneeded variable in integration test

Co-Authored-By: Michael Penkov <m@penkov.dev>

* Remove unneeded data variable

Co-Authored-By: Michael Penkov <m@penkov.dev>

* Remove .format from test_gcs

* Move RESUMEABLE_SESSION_URI_TEMPLATE to module scope

* Remove unnecessary explicit encoding

Co-Authored-By: Michael Penkov <m@penkov.dev>

* Remove unnecessary variable in read

Co-Authored-By: Michael Penkov <m@penkov.dev>

* Change assertion to use _REQUIRED_CHUNK_MULTIPLE

Co-Authored-By: Michael Penkov <m@penkov.dev>

* Import clamp from s3

* Add comment on buffering being read-only

* Fix client docstring

* Make SeekableRawReader internal

* Add return value to seek in _SeekableRawReader

* Allow integration test to take a prefix

* Add doc to NotFound exception

* Fix misleading log statement

* Various formatting changes

* Add additional assertion for min_part_size

* Clean up _upload_next_part

* Improve UploadFailedError

* Add docstring to terminate

* Add additional clean up to close

* Remove useless terminate in SeekableBufferedInputBase

* Fix data type for status_code in UploadFailedError

* Clean up UploadFailedError msg

* Start on mock tests

* Add tests for mocks

* Clean up registering dependencies

* Get initialize_bucket to work without gsutil

* Change buffering to buffer_size

* Add copyright header, fix logging styles, and move result outside ctx manager

* Change _upload_empty_part to debug msg

* Clean up patching style

* Add tests for smart_open_lib

* Add blank lines before constants to help readability

* Add missing clean up of raw_reader in close

* Remove aws related credentials from gs tests

Co-authored-by: Michael Penkov <m@penkov.dev>
petedannemann and mpenkov committed Jan 24, 2020
1 parent a621aeb commit a9aa466
Showing 8 changed files with 1,550 additions and 5 deletions.
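Several of the commit messages above (min part size, _REQUIRED_CHUNK_MULTIPLE, _upload_next_part) concern how the GCS writer buffers output: GCS resumable uploads require every non-final part to be a multiple of a fixed chunk size, so writes accumulate until at least min_part_size bytes are held, and only chunk-aligned slices are sent. The gcs.py diff itself is not shown on this page, so the following is only a sketch of the idea — the 256 KiB multiple matches the chunk alignment GCS documents for resumable uploads, but the class and the upload_part callback here are hypothetical:

import io

_REQUIRED_CHUNK_MULTIPLE = 256 * 1024  # resumable-upload parts must be multiples of 256 KiB


class _SketchWriter:
    """Illustrative only: buffer writes, flushing chunk-aligned parts."""

    def __init__(self, upload_part, min_part_size=_REQUIRED_CHUNK_MULTIPLE):
        # Mirrors the commit "Add additional assertion for min_part_size"
        assert min_part_size % _REQUIRED_CHUNK_MULTIPLE == 0
        self._upload_part = upload_part  # hypothetical callable posting one part to the session
        self._min_part_size = min_part_size
        self._buf = io.BytesIO()

    def write(self, data):
        self._buf.write(data)
        if self._buf.tell() >= self._min_part_size:
            self._flush(is_last=False)
        return len(data)

    def close(self):
        self._flush(is_last=True)  # the final part may be any size

    def _flush(self, is_last):
        payload = self._buf.getvalue()
        if is_last:
            send, keep = payload, b''
        else:
            # Non-final parts must be chunk-aligned; hold the remainder back.
            cutoff = len(payload) // _REQUIRED_CHUNK_MULTIPLE * _REQUIRED_CHUNK_MULTIPLE
            send, keep = payload[:cutoff], payload[cutoff:]
        self._upload_part(send, is_last)
        self._buf = io.BytesIO()
        self._buf.write(keep)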
6 changes: 6 additions & 0 deletions .gitignore
@@ -56,3 +56,9 @@ target/
# vim
*.swp
*.swo

+# PyCharm
+.idea/
+
+# env files
+.env
12 changes: 11 additions & 1 deletion README.rst
@@ -12,7 +12,7 @@ smart_open — utils for streaming large files in Python
What?
=====

-``smart_open`` is a Python 2 & Python 3 library for **efficient streaming of very large files** from/to storages such as S3, HDFS, WebHDFS, HTTP, HTTPS, SFTP, or local filesystem. It supports transparent, on-the-fly (de-)compression for a variety of different formats.
+``smart_open`` is a Python 2 & Python 3 library for **efficient streaming of very large files** from/to storages such as S3, GCS, HDFS, WebHDFS, HTTP, HTTPS, SFTP, or local filesystem. It supports transparent, on-the-fly (de-)compression for a variety of different formats.

``smart_open`` is a drop-in replacement for Python's built-in ``open()``: it can do anything ``open`` can (100% compatible, falls back to native ``open`` wherever possible), plus lots of nifty extra stuff on top.

@@ -80,6 +80,7 @@ Other examples of URLs that ``smart_open`` accepts::
s3://my_bucket/my_key
s3://my_key:my_secret@my_bucket/my_key
s3://my_key:my_secret@my_server:my_port@my_bucket/my_key
+gs://my_bucket/my_blob
hdfs:///path/file
hdfs://path/file
webhdfs://host:port/path/file
@@ -174,6 +175,14 @@ More examples
    with open('s3://bucket/key.txt', 'wb', transport_params=transport_params) as fout:
        fout.write(b'here we stand')

+   # stream from GCS
+   for line in open('gs://my_bucket/my_file.txt'):
+       print(line)
+
+   # stream content *into* GCS (write mode):
+   with open('gs://my_bucket/my_file.txt', 'wb') as fout:
+       fout.write(b'hello world')

Supported Compression Formats
-----------------------------

@@ -212,6 +221,7 @@ Transport-specific Options
- HTTP, HTTPS (read-only)
- SSH, SCP and SFTP
- WebHDFS
+- GCS

Each option involves setting up its own set of parameters.
For example, for accessing S3, you often need to set up authentication, like API keys or a profile name.
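The GCS transport added here works the same way. A minimal sketch, assuming the keyword names that appear in the commit messages above (``client`` and ``min_part_size``) are what ``transport_params`` forwards — treat them as assumptions rather than documented API:

import google.cloud.storage
from smart_open import open

# 'client' lets you supply a pre-configured google.cloud.storage.Client;
# 'min_part_size' bounds the size of each uploaded part (names assumed).
client = google.cloud.storage.Client()
transport_params = dict(client=client, min_part_size=50 * 1024**2)

with open('gs://my_bucket/my_blob.txt', 'wb', transport_params=transport_params) as fout:
    fout.write(b'uploaded through a custom client')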
123 changes: 123 additions & 0 deletions integration-tests/test_gcs.py
@@ -0,0 +1,123 @@
# -*- coding: utf-8 -*-
import io
import os

import google.cloud.storage
from six.moves.urllib import parse as urlparse

import smart_open

_GCS_URL = os.environ.get('SO_GCS_URL')
assert _GCS_URL is not None, 'please set the SO_GCS_URL environment variable'


def initialize_bucket():
    client = google.cloud.storage.Client()
    parsed = urlparse.urlparse(_GCS_URL)
    bucket_name = parsed.netloc
    prefix = parsed.path
    bucket = client.get_bucket(bucket_name)
    blobs = bucket.list_blobs(prefix=prefix)
    for blob in blobs:
        blob.delete()


def write_read(key, content, write_mode, read_mode, **kwargs):
    with smart_open.open(key, write_mode, **kwargs) as fout:
        fout.write(content)
    with smart_open.open(key, read_mode, **kwargs) as fin:
        return fin.read()


def read_length_prefixed_messages(key, read_mode, **kwargs):
    result = io.BytesIO()

    with smart_open.open(key, read_mode, **kwargs) as fin:
        length_byte = fin.read(1)
        while len(length_byte):
            result.write(length_byte)
            msg = fin.read(ord(length_byte))
            result.write(msg)
            length_byte = fin.read(1)
    return result.getvalue()


def test_gcs_readwrite_text(benchmark):
    initialize_bucket()

    key = _GCS_URL + '/sanity.txt'
    text = 'с гранатою в кармане, с чекою в руке'
    actual = benchmark(write_read, key, text, 'w', 'r', encoding='utf-8')
    assert actual == text


def test_gcs_readwrite_text_gzip(benchmark):
    initialize_bucket()

    key = _GCS_URL + '/sanity.txt.gz'
    text = 'не чайки здесь запели на знакомом языке'
    actual = benchmark(write_read, key, text, 'w', 'r', encoding='utf-8')
    assert actual == text


def test_gcs_readwrite_binary(benchmark):
    initialize_bucket()

    key = _GCS_URL + '/sanity.txt'
    binary = b'this is a test'
    actual = benchmark(write_read, key, binary, 'wb', 'rb')
    assert actual == binary


def test_gcs_readwrite_binary_gzip(benchmark):
    initialize_bucket()

    key = _GCS_URL + '/sanity.txt.gz'
    binary = b'this is a test'
    actual = benchmark(write_read, key, binary, 'wb', 'rb')
    assert actual == binary


def test_gcs_performance(benchmark):
    initialize_bucket()

    one_megabyte = io.BytesIO()
    for _ in range(1024*128):
        one_megabyte.write(b'01234567')
    one_megabyte = one_megabyte.getvalue()

    key = _GCS_URL + '/performance.txt'
    actual = benchmark(write_read, key, one_megabyte, 'wb', 'rb')
    assert actual == one_megabyte


def test_gcs_performance_gz(benchmark):
    initialize_bucket()

    one_megabyte = io.BytesIO()
    for _ in range(1024*128):
        one_megabyte.write(b'01234567')
    one_megabyte = one_megabyte.getvalue()

    key = _GCS_URL + '/performance.txt.gz'
    actual = benchmark(write_read, key, one_megabyte, 'wb', 'rb')
    assert actual == one_megabyte


def test_gcs_performance_small_reads(benchmark):
    initialize_bucket()

    ONE_MIB = 1024**2
    one_megabyte_of_msgs = io.BytesIO()
    msg = b'\x0f' + b'0123456789abcde'  # a length-prefixed "message"
    for _ in range(0, ONE_MIB, len(msg)):
        one_megabyte_of_msgs.write(msg)
    one_megabyte_of_msgs = one_megabyte_of_msgs.getvalue()

    key = _GCS_URL + '/many_reads_performance.bin'

    with smart_open.open(key, 'wb') as fout:
        fout.write(one_megabyte_of_msgs)

    actual = benchmark(read_length_prefixed_messages, key, 'rb', buffering=ONE_MIB)
    assert actual == one_megabyte_of_msgs
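A note on running these tests: they expect the ``SO_GCS_URL`` environment variable to point at a bucket (optionally with a key prefix — see "Allow integration test to take a prefix" above) whose contents they are free to delete, plus working Google application credentials, and the ``benchmark`` fixture implies pytest with the pytest-benchmark plugin. Something like ``SO_GCS_URL=gs://my-test-bucket/smart_open pytest integration-tests/test_gcs.py`` should work; the exact invocation is an assumption, not taken from the repo's docs.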
5 changes: 3 additions & 2 deletions setup.py
@@ -59,14 +59,15 @@ def read(fname):
    'boto >= 2.32',
    'requests',
    'boto3',
+    'google-cloud-storage',
]
if sys.version_info[0] == 2:
    install_requires.append('bz2file')

setup(
    name='smart_open',
    version=__version__,
-    description='Utils for streaming large files (S3, HDFS, gzip, bz2...)',
+    description='Utils for streaming large files (S3, HDFS, GCS, gzip, bz2...)',
    long_description=read('README.rst'),

    packages=find_packages(),
@@ -82,7 +83,7 @@ def read(fname):
    url='https://github.com/piskvorky/smart_open',
    download_url='http://pypi.python.org/pypi/smart_open',

-    keywords='file streaming, s3, hdfs',
+    keywords='file streaming, s3, hdfs, gcs',

    license='MIT',
    platforms='any',
Diffs for the remaining four changed files are not shown.