From 6eeb5b46d2d503fd57f4dc63dfffd922bddbc4f4 Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Mon, 11 Nov 2019 18:54:37 -0500 Subject: [PATCH 01/84] GCS support working --- smart_open/gcs.py | 499 +++++++++++++++++++++++++++++++++++ smart_open/smart_open_lib.py | 27 +- smart_open/tests/test_gcs.py | 418 +++++++++++++++++++++++++++++ 3 files changed, 943 insertions(+), 1 deletion(-) create mode 100644 smart_open/gcs.py create mode 100644 smart_open/tests/test_gcs.py diff --git a/smart_open/gcs.py b/smart_open/gcs.py new file mode 100644 index 00000000..b79f0680 --- /dev/null +++ b/smart_open/gcs.py @@ -0,0 +1,499 @@ +# -*- coding: utf-8 -*- +"""Implements file-like objects for reading and writing to/from GCS.""" + +import io +import logging +import sys + +from google.cloud import storage +import google.auth.transport.requests as google_requests +import six + +import smart_open.bytebuffer + +logger = logging.getLogger(__name__) + +READ_BINARY = 'rb' +WRITE_BINARY = 'wb' +MODES = (READ_BINARY, WRITE_BINARY) +"""Allowed I/O modes for working with S3.""" + +_BINARY_TYPES = (six.binary_type, bytearray) +"""Allowed binary buffer types for writing to the underlying S3 stream""" +if sys.version_info >= (2, 7): + _BINARY_TYPES = (six.binary_type, bytearray, memoryview) + +_UNKNOWN_FILE_SIZE = '*' + +BINARY_NEWLINE = b'\n' + +SUPPORTED_SCHEMES = ("gcs", "gs") + +DEFAULT_MIN_PART_SIZE = MIN_MIN_PART_SIZE = 256 * 1024 +"""The absolute minimum permitted by Google.""" +DEFAULT_BUFFER_SIZE = 128 * 1024 + +START = 0 +CURRENT = 1 +END = 2 +WHENCE_CHOICES = [START, CURRENT, END] + + +def clamp(value, minval, maxval): + return max(min(value, maxval), minval) + + +def make_range_string(start, stop=None, end=_UNKNOWN_FILE_SIZE): + # + # https://cloud.google.com/storage/docs/xml-api/resumable-upload#step_3upload_the_file_blocks + # + if stop is None: + return 'bytes %d-/%s' % (start, end) + return 'bytes %d-%d/%s' % (start, stop, end) + + +def open( + bucket_id, + blob_id, + mode, + 
buffer_size=DEFAULT_BUFFER_SIZE, + min_part_size=DEFAULT_MIN_PART_SIZE, + client=None, # type: storage.Client + ): + """Open an GCS blob for reading or writing. + + Parameters + ---------- + bucket_id: str + The name of the bucket this object resides in. + blob_id: str + The name of the blob within the bucket. + mode: str + The mode for opening the object. Must be either "rb" or "wb". + buffer_size: int, optional + The buffer size to use when performing I/O. + min_part_size: int, optional + The minimum part size for multipart uploads. For writing only. + client: object, optional + The GCS client to use when working with google-cloud-storage. + + """ + if mode == READ_BINARY: + return SeekableBufferedInputBase( + bucket_id, + blob_id, + buffer_size=buffer_size, + line_terminator=BINARY_NEWLINE, + client=client, + ) + elif mode == WRITE_BINARY: + return BufferedOutputBase( + bucket_id, + blob_id, + min_part_size=min_part_size, + client=client, + ) + else: + raise NotImplementedError('GCS support for mode %r not implemented' % mode) + + +class RawReader(object): + """Read an GCS blob.""" + def __init__(self, + gcs_blob, # type: storage.Blob + ): + self.position = 0 + self._blob = gcs_blob + self._body = gcs_blob.download_as_string() + + def read(self, size=-1): + if size == -1: + return self._body + start, end = self.position, self.position + size + self.position = end + return self._body[start:end] + + +class SeekableRawReader(object): + """Read an GCS object.""" + + def __init__( + self, + gcs_blob, # type: storage.Blob + size, + ): + self._blob = gcs_blob + self._size = size + self.seek(0) + + def seek(self, position): + """Seek to the specified position (byte offset) in the GCS key. + + :param int position: The byte offset from the beginning of the key. + """ + self._position = position + + if position == self._size == 0 or position == self._size: + # + # When reading, we can't seek to the first byte of an empty file. 
+ # Similarly, we can't seek past the last byte. Do nothing here. + # + self._body = io.BytesIO() + else: + start, end = position, position + self._size + self._body = self._blob.download_as_string(start=start, end=end) + + def read(self, size=-1): + if self._position >= self._size: + return b'' + if size == -1: + binary = self._body + else: + binary = self._body[:size] + self._position += len(binary) + return binary + + +class BufferedInputBase(io.BufferedIOBase): + def __init__( + self, + bucket, + key, + buffer_size=DEFAULT_BUFFER_SIZE, + line_terminator=BINARY_NEWLINE, + client=None, # type: storage.Client + ): + if not client: + client = storage.Client() + + bucket = client.get_bucket(bucket) # type: storage.Bucket + + self._blob = bucket.get_blob(key) # type: storage.Blob + self._size = self._blob.size if self._blob.size else 0 + + self._raw_reader = RawReader(self._blob) + self._current_pos = 0 + self._buffer_size = buffer_size + self._buffer = smart_open.bytebuffer.ByteBuffer(buffer_size) + self._eof = False + self._line_terminator = line_terminator + + # + # This member is part of the io.BufferedIOBase interface. + # + self.raw = None + + # + # Override some methods from io.IOBase. + # + def close(self): + """Flush and close this stream.""" + logger.debug("close: called") + self._blob = None + + def readable(self): + """Return True if the stream can be read from.""" + return True + + def seekable(self): + return False + + # + # io.BufferedIOBase methods. + # + def detach(self): + """Unsupported.""" + raise io.UnsupportedOperation + + def read(self, size=-1): + """Read up to size bytes from the object and return them.""" + if size == 0: + return b'' + elif size < 0: + from_buf = self._read_from_buffer() + self._current_pos = self._size + return from_buf + self._raw_reader.read() + + # + # Return unused data first + # + if len(self._buffer) >= size: + return self._read_from_buffer(size) + + # + # If the stream is finished, return what we have. 
+ # + if self._eof: + return self._read_from_buffer() + + # + # Fill our buffer to the required size. + # + # logger.debug('filling %r byte-long buffer up to %r bytes', len(self._buffer), size) + self._fill_buffer(size) + return self._read_from_buffer(size) + + def read1(self, size=-1): + """This is the same as read().""" + return self.read(size=size) + + def readinto(self, b): + """Read up to len(b) bytes into b, and return the number of bytes + read.""" + data = self.read(len(b)) + if not data: + return 0 + b[:len(data)] = data + return len(data) + + def readline(self, limit=-1): + """Read up to and including the next newline. Returns the bytes read.""" + if limit != -1: + raise NotImplementedError('limits other than -1 not implemented yet') + the_line = io.BytesIO() + while not (self._eof and len(self._buffer) == 0): + # + # In the worst case, we're reading the unread part of self._buffer + # twice here, once in the if condition and once when calling index. + # + # This is sub-optimal, but better than the alternative: wrapping + # .index in a try..except, because that is slower. + # + remaining_buffer = self._buffer.peek() + if self._line_terminator in remaining_buffer: + next_newline = remaining_buffer.index(self._line_terminator) + the_line.write(self._read_from_buffer(next_newline + 1)) + break + else: + the_line.write(self._read_from_buffer()) + self._fill_buffer() + return the_line.getvalue() + + def terminate(self): + """Do nothing.""" + pass + + # + # Internal methods. 
+ # + def _read_from_buffer(self, size=-1): + """Remove at most size bytes from our buffer and return them.""" + # logger.debug('reading %r bytes from %r byte-long buffer', size, len(self._buffer)) + size = size if size >= 0 else len(self._buffer) + part = self._buffer.read(size) + self._current_pos += len(part) + # logger.debug('part: %r', part) + return part + + def _fill_buffer(self, size=-1): + size = size if size >= 0 else self._buffer._chunk_size + while len(self._buffer) < size and not self._eof: + bytes_read = self._buffer.fill(self._raw_reader) + if bytes_read == 0: + logger.debug('reached EOF while filling buffer') + self._eof = True + + +class SeekableBufferedInputBase(BufferedInputBase): + """Reads bytes from GCS. + + Implements the io.BufferedIOBase interface of the standard library.""" + + def __init__( + self, + bucket, + key, + buffer_size=DEFAULT_BUFFER_SIZE, + line_terminator=BINARY_NEWLINE, + client=None, # type: storage.Client + ): + if not client: + client = storage.Client() + bucket = client.get_bucket(bucket) + + self._blob = bucket.get_blob(key) + assert self._blob.exists(), 'Blob %s does not exist!' % key + self._size = self._blob.size if self._blob.size is not None else 0 + + self._raw_reader = SeekableRawReader(self._blob, self._size) + self._current_pos = 0 + self._buffer_size = buffer_size + self._buffer = smart_open.bytebuffer.ByteBuffer(buffer_size) + self._eof = False + self._line_terminator = line_terminator + + # + # This member is part of the io.BufferedIOBase interface. + # + self.raw = None + + def seekable(self): + """If False, seek(), tell() and truncate() will raise IOError. + + We offer only seek support, and no truncate support.""" + return True + + def seek(self, offset, whence=START): + """Seek to the specified position. + + :param int offset: The offset in bytes. + :param int whence: Where the offset is from. 
+ + Returns the position after seeking.""" + logger.debug('seeking to offset: %r whence: %r', offset, whence) + if whence not in WHENCE_CHOICES: + raise ValueError('invalid whence, expected one of %r' % WHENCE_CHOICES) + + if whence == START: + new_position = offset + elif whence == CURRENT: + new_position = self._current_pos + offset + else: + new_position = self._size + offset + new_position = clamp(new_position, 0, self._size) + self._current_pos = new_position + self._raw_reader.seek(new_position) + logger.debug('new_position: %r', self._current_pos) + + self._buffer.empty() + self._eof = self._current_pos == self._size + return self._current_pos + + def tell(self): + """Return the current position within the file.""" + return self._current_pos + + def truncate(self, size=None): + """Unsupported.""" + raise io.UnsupportedOperation + + +class BufferedOutputBase(io.BufferedIOBase): + """Writes bytes to GCS. + + Implements the io.BufferedIOBase interface of the standard library.""" + + def __init__( + self, + bucket, + blob, + min_part_size=DEFAULT_MIN_PART_SIZE, + client=None, # type: storage.Client + ): + if client is None: + client = storage.Client() + self._client = client + self._credentials = self._client._credentials + self._bucket = self._client.bucket(bucket) # type: storage.Bucket + self._blob = self._bucket.blob(blob) # type: storage.Blob + + self._min_part_size = min_part_size + self._total_size = 0 + self._total_parts = 0 + self._buf = io.BytesIO() + + self._session = google_requests.AuthorizedSession(self._credentials) + # https://cloud.google.com/storage/docs/json_api/v1/how-tos/resumable-upload#start-resumable + self._resumeable_upload_url = self._blob.create_resumable_upload_session() + + # + # This member is part of the io.BufferedIOBase interface. + # + self.raw = None + + def flush(self): + pass + + # + # Override some methods from io.IOBase. 
+ # + def close(self): + logger.debug("closing") + if self._total_size == 0: # empty files + self._upload_empty_part() + if self._buf.tell(): + self._upload_next_part() + logger.debug("successfully closed") + + def writable(self): + """Return True if the stream supports writing.""" + return True + + def tell(self): + """Return the current stream position.""" + return self._total_size + + # + # io.BufferedIOBase methods. + # + def detach(self): + raise io.UnsupportedOperation("detach() not supported") + + def write(self, b): + """Write the given bytes (binary string) to the GCS file. + + There's buffering happening under the covers, so this may not actually + do any HTTP transfer right away.""" + + if not isinstance(b, _BINARY_TYPES): + raise TypeError( + "input must be one of %r, got: %r" % (_BINARY_TYPES, type(b))) + + self._buf.write(b) + self._total_size += len(b) + + if self._buf.tell() >= self._min_part_size: + self._upload_next_part() + + return len(b) + + def terminate(self): + # https://cloud.google.com/storage/docs/xml-api/resumable-upload#example_cancelling_an_upload + self._session.delete(self._resumeable_upload_url) + + # + # Internal methods. 
+ # + def _upload_next_part(self): + part_num = self._total_parts + 1 + logger.info("uploading part #%i, %i bytes (total %.3fGB)", + part_num, self._buf.tell(), self._total_size / 1024.0 ** 3) + content_length = self._buf.tell() + start = self._total_size - content_length + stop = self._total_size - 1 + if content_length < MIN_MIN_PART_SIZE: + end = content_length + else: + end = _UNKNOWN_FILE_SIZE + self._buf.seek(0) + + headers = { + 'Content-Length': str(content_length), + 'Content-Range': make_range_string(start, stop, end) + } + # TODO: Add error handling / retrying here + response = self._session.put(self._resumeable_upload_url, data=self._buf, headers=headers) + assert response.status_code in (200, 201) + logger.debug("upload of part #%i finished" % part_num) + + self._total_parts += 1 + self._buf = io.BytesIO() + + def _upload_empty_part(self): + logger.info("creating empty file") + headers = { + 'Content-Length': '0' + } + response = self._session.put(self._resumeable_upload_url, headers=headers) + assert response.status_code in (200, 201) + + self._total_parts += 1 + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if exc_type is not None: + self.terminate() + else: + self.close() diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index 26bc7316..d90608b4 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -53,6 +53,7 @@ import smart_open.webhdfs as smart_open_webhdfs import smart_open.http as smart_open_http import smart_open.ssh as smart_open_ssh +import smart_open.gcs as smart_open_gcs from smart_open import doctools @@ -124,6 +125,7 @@ def _handle_gzip(file_obj, mode): 'uri_path', 'bucket_id', 'key_id', + 'blob_id', 'port', 'host', 'ordinary_calling_format', @@ -135,7 +137,7 @@ def _handle_gzip(file_obj, mode): """Represents all the options that we parse from user input. Some of the above options only make sense for certain protocols, e.g. 
-bucket_id is only for S3. +bucket_id is only for S3 and GCS. """ # # Set the default values for all Uri fields to be None. This allows us to only @@ -376,6 +378,10 @@ def open( doctools.extract_kwargs(smart_open_ssh.open.__doc__), lpad=u' ', ), + 'gcs': doctools.to_docstring( + doctools.extract_kwargs(smart_open_gcs.open.__doc__), + lpad=u' ', + ), 'examples': doctools.extract_examples_from_readme_rst(), } @@ -568,6 +574,9 @@ def _open_binary_stream(uri, mode, transport_params): filename = P.basename(urlparse.urlparse(uri).path) kw = _check_kwargs(smart_open_http.open, transport_params) return smart_open_http.open(uri, mode, **kw), filename + elif parsed_uri.scheme in smart_open_gcs.SUPPORTED_SCHEMES: + kw =_check_kwargs(smart_open_gcs.open, transport_params) + return smart_open_gcs.open(parsed_uri.bucket_id, parsed_uri.blob_id, mode, **kw), filename else: raise NotImplementedError("scheme %r is not supported", parsed_uri.scheme) elif hasattr(uri, 'read'): @@ -676,6 +685,8 @@ def _parse_uri(uri_as_string): Supported URI schemes are: * file + * gcs + * gs * hdfs * http * https @@ -686,6 +697,7 @@ def _parse_uri(uri_as_string): * webhdfs .s3, s3a and s3n are treated the same way. s3u is s3 but without SSL. + .gcs and gcs are treated the same way. 
Valid URI examples:: @@ -703,6 +715,7 @@ def _parse_uri(uri_as_string): * file:///home/user/file.bz2 * [ssh|scp|sftp]://username@host//path/file * [ssh|scp|sftp]://username@host/path/file + * gs://my_bucket/my_blob """ if os.name == 'nt': @@ -727,6 +740,8 @@ def _parse_uri(uri_as_string): return Uri(scheme=parsed_uri.scheme, uri_path=uri_as_string) elif parsed_uri.scheme in smart_open_ssh.SCHEMES: return _parse_uri_ssh(parsed_uri) + elif parsed_uri.scheme in smart_open_gcs.SUPPORTED_SCHEMES: + return _parse_uri_gcs(parsed_uri) else: raise NotImplementedError( "unknown URI scheme %r in %r" % (parsed_uri.scheme, uri_as_string) @@ -840,6 +855,16 @@ def _parse_uri_ssh(unt): return Uri(scheme=unt.scheme, uri_path=unt.path, user=user, host=host, port=port) +def _parse_uri_gcs(parsed_uri): + assert parsed_uri.scheme in smart_open_gcs.SUPPORTED_SCHEMES + + bucket_id, blob_id = parsed_uri.netloc, parsed_uri.path[1:] + + return Uri( + scheme=parsed_uri.scheme, bucket_id=bucket_id, blob_id=blob_id, + ) + + def _need_to_buffer(file_obj, mode, ext): """Returns True if we need to buffer the whole file in memory in order to proceed.""" try: diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py new file mode 100644 index 00000000..5142fd2c --- /dev/null +++ b/smart_open/tests/test_gcs.py @@ -0,0 +1,418 @@ +import uuid +import logging +import time +import warnings +import unittest +import io +import gzip + +import google.cloud +import google.api_core.exceptions +import six + +import smart_open + +BUCKET_NAME = 'test-smartopen-{}'.format(uuid.uuid4().hex) +BLOB_NAME = 'test-blob' +WRITE_BLOB_NAME = 'test-write-blob' + + +logger = logging.getLogger(__name__) +storage_client = google.cloud.storage.Client() + + +def setUpModule(): + '''Called once by unittest when initializing this module. Sets up the + test GCS bucket. + + ''' + storage_client.create_bucket(BUCKET_NAME) + + +def tearDownModule(): + '''Called once by unittest when tearing down this module. 
Empties and + removes the test GCS bucket. + + ''' + try: + bucket = get_bucket() + bucket.delete() + except google.cloud.exceptions.NotFound: + pass + + +def get_bucket(): + return storage_client.bucket(BUCKET_NAME) + + +def get_blob(): + bucket = get_bucket() + return bucket.blob(BLOB_NAME) + + +def cleanup_bucket(): + bucket = get_bucket() + + blobs = bucket.list_blobs() + for blob in blobs: + blob.delete() + + +def put_to_bucket(contents, num_attempts=12, sleep_time=5): + # fake (or not) connection, bucket and key + logger.debug('%r', locals()) + + # + # In real life, it can take a few seconds for the bucket to become ready. + # If we try to write to the key while the bucket while it isn't ready, we + # will get a StorageError: NotFound. + # + for attempt in range(num_attempts): + try: + blob = get_blob() + blob.upload_from_string(contents) + return + except google.cloud.exceptions.NotFound as err: + logger.error('caught %r, retrying', err) + time.sleep(sleep_time) + + assert False, 'failed to create bucket %s after %d attempts' % (BUCKET_NAME, num_attempts) + + +def ignore_resource_warnings(): + if six.PY2: + return + warnings.filterwarnings("ignore", category=ResourceWarning, message="unclosed.*") + + +class SeekableBufferedInputBaseTest(unittest.TestCase): + def setUp(self): + # lower the multipart upload size, to speed up these tests + self.old_min_buffer_size = smart_open.gcs.DEFAULT_BUFFER_SIZE + smart_open.gcs.DEFAULT_BUFFER_SIZE = 5 * 1024**2 + + ignore_resource_warnings() + + def tearDown(self): + cleanup_bucket() + + def test_iter(self): + """Are GCS files iterated over correctly?""" + # a list of strings to test with + expected = u"hello wořld\nhow are you?".encode('utf8') + put_to_bucket(contents=expected) + + # connect to fake GCS and read from the fake key we filled above + fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) + output = [line.rstrip(b'\n') for line in fin] + self.assertEqual(output, expected.split(b'\n')) + + def 
test_iter_context_manager(self): + # same thing but using a context manager + expected = u"hello wořld\nhow are you?".encode('utf8') + put_to_bucket(contents=expected) + with smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) as fin: + output = [line.rstrip(b'\n') for line in fin] + self.assertEqual(output, expected.split(b'\n')) + + def test_read(self): + """Are GCS files read correctly?""" + content = u"hello wořld\nhow are you?".encode('utf8') + put_to_bucket(contents=content) + logger.debug('content: %r len: %r', content, len(content)) + + fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) + self.assertEqual(content[:6], fin.read(6)) + self.assertEqual(content[6:14], fin.read(8)) # ř is 2 bytes + self.assertEqual(content[14:], fin.read()) # read the rest + + def test_seek_beginning(self): + """Does seeking to the beginning of GCS files work correctly?""" + content = u"hello wořld\nhow are you?".encode('utf8') + put_to_bucket(contents=content) + + fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) + self.assertEqual(content[:6], fin.read(6)) + self.assertEqual(content[6:14], fin.read(8)) # ř is 2 bytes + + fin.seek(0) + self.assertEqual(content, fin.read()) # no size given => read whole file + + fin.seek(0) + self.assertEqual(content, fin.read(-1)) # same thing + + def test_seek_start(self): + """Does seeking from the start of GCS files work correctly?""" + content = u"hello wořld\nhow are you?".encode('utf8') + put_to_bucket(contents=content) + + fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) + seek = fin.seek(6) + self.assertEqual(seek, 6) + self.assertEqual(fin.tell(), 6) + self.assertEqual(fin.read(6), u'wořld'.encode('utf-8')) + + def test_seek_current(self): + """Does seeking from the middle of GCS files work correctly?""" + content = u"hello wořld\nhow are you?".encode('utf8') + put_to_bucket(contents=content) + + fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, 
BLOB_NAME) + self.assertEqual(fin.read(5), b'hello') + seek = fin.seek(1, whence=smart_open.gcs.CURRENT) + self.assertEqual(seek, 6) + self.assertEqual(fin.read(6), u'wořld'.encode('utf-8')) + + def test_seek_end(self): + """Does seeking from the end of GCS files work correctly?""" + content = u"hello wořld\nhow are you?".encode('utf8') + put_to_bucket(contents=content) + + fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) + seek = fin.seek(-4, whence=smart_open.gcs.END) + self.assertEqual(seek, len(content) - 4) + self.assertEqual(fin.read(), b'you?') + + def test_detect_eof(self): + content = u"hello wořld\nhow are you?".encode('utf8') + put_to_bucket(contents=content) + + fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) + fin.read() + eof = fin.tell() + self.assertEqual(eof, len(content)) + fin.seek(0, whence=smart_open.gcs.END) + self.assertEqual(eof, fin.tell()) + + def test_read_gzip(self): + expected = u'раcцветали яблони и груши, поплыли туманы над рекой...'.encode('utf-8') + buf = io.BytesIO() + buf.close = lambda: None # keep buffer open so that we can .getvalue() + with gzip.GzipFile(fileobj=buf, mode='w') as zipfile: + zipfile.write(expected) + put_to_bucket(contents=buf.getvalue()) + + # + # Make sure we're reading things correctly. + # + with smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) as fin: + self.assertEqual(fin.read(), buf.getvalue()) + + # + # Make sure the buffer we wrote is legitimate gzip. 
+ # + sanity_buf = io.BytesIO(buf.getvalue()) + with gzip.GzipFile(fileobj=sanity_buf) as zipfile: + self.assertEqual(zipfile.read(), expected) + + logger.debug('starting actual test') + with smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) as fin: + with gzip.GzipFile(fileobj=fin) as zipfile: + actual = zipfile.read() + + self.assertEqual(expected, actual) + + def test_readline(self): + content = b'englishman\nin\nnew\nyork\n' + put_to_bucket(contents=content) + + with smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) as fin: + fin.readline() + self.assertEqual(fin.tell(), content.index(b'\n')+1) + + fin.seek(0) + actual = list(fin) + self.assertEqual(fin.tell(), len(content)) + + expected = [b'englishman\n', b'in\n', b'new\n', b'york\n'] + self.assertEqual(expected, actual) + + def test_readline_tiny_buffer(self): + content = b'englishman\nin\nnew\nyork\n' + put_to_bucket(contents=content) + + with smart_open.gcs.BufferedInputBase(BUCKET_NAME, BLOB_NAME, buffer_size=8) as fin: + actual = list(fin) + + expected = [b'englishman\n', b'in\n', b'new\n', b'york\n'] + self.assertEqual(expected, actual) + + def test_read0_does_not_return_data(self): + content = b'englishman\nin\nnew\nyork\n' + put_to_bucket(contents=content) + + with smart_open.gcs.BufferedInputBase(BUCKET_NAME, BLOB_NAME) as fin: + data = fin.read(0) + + self.assertEqual(data, b'') + + +class BufferedOutputBaseTest(unittest.TestCase): + """ + Test writing into GCS files. 
+ + """ + def setUp(self): + ignore_resource_warnings() + + def tearDown(self): + cleanup_bucket() + + def test_write_01(self): + """Does writing into GCS work correctly?""" + test_string = u"žluťoučký koníček".encode('utf8') + + with smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) as fout: + fout.write(test_string) + + output = list(smart_open.open("gs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME), "rb")) + + self.assertEqual(output, [test_string]) + + def test_write_01a(self): + """Does gcs write fail on incorrect input?""" + try: + with smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) as fin: + fin.write(None) + except TypeError: + pass + else: + self.fail() + + def test_write_02(self): + """Does gcs write unicode-utf8 conversion work?""" + smart_open_write = smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) + smart_open_write.tell() + logger.info("smart_open_write: %r", smart_open_write) + with smart_open_write as fout: + fout.write(u"testžížáč".encode("utf-8")) + self.assertEqual(fout.tell(), 14) + + def test_write_03(self): + """Does gcs multipart chunking work correctly?""" + # write + smart_open_write = smart_open.gcs.BufferedOutputBase( + BUCKET_NAME, WRITE_BLOB_NAME, min_part_size=10 + ) + with smart_open_write as fout: + fout.write(b"test") + self.assertEqual(fout._buf.tell(), 4) + + fout.write(b"test\n") + self.assertEqual(fout._buf.tell(), 9) + self.assertEqual(fout._total_parts, 0) + + fout.write(b"test") + self.assertEqual(fout._buf.tell(), 0) + self.assertEqual(fout._total_parts, 1) + + # read back the same key and check its content + output = list(smart_open.open("gcs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME))) + self.assertEqual(output, ["testtest\n", "test"]) + + def test_write_04(self): + """Does writing no data cause key with an empty value to be created?""" + smart_open_write = smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) + with smart_open_write as fout: # noqa + pass + + # 
read back the same key and check its content + output = list(smart_open.open("gcs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME))) + + self.assertEqual(output, []) + + def test_gzip(self): + expected = u'а не спеть ли мне песню... о любви'.encode('utf-8') + with smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) as fout: + with gzip.GzipFile(fileobj=fout, mode='w') as zipfile: + zipfile.write(expected) + + with smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, WRITE_BLOB_NAME) as fin: + with gzip.GzipFile(fileobj=fin) as zipfile: + actual = zipfile.read() + + self.assertEqual(expected, actual) + + def test_buffered_writer_wrapper_works(self): + """ + Ensure that we can wrap a smart_open gcs stream in a BufferedWriter, which + passes a memoryview object to the underlying stream in python >= 2.7 + """ + expected = u'не думай о секундах свысока' + + with smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) as fout: + with io.BufferedWriter(fout) as sub_out: + sub_out.write(expected.encode('utf-8')) + + with smart_open.smart_open("gcs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME)) as fin: + with io.TextIOWrapper(fin, encoding='utf-8') as text: + actual = text.read() + + self.assertEqual(expected, actual) + + def test_binary_iterator(self): + expected = u"выйду ночью в поле с конём".encode('utf-8').split(b' ') + put_to_bucket(contents=b"\n".join(expected)) + with smart_open.gcs.open(BUCKET_NAME, BLOB_NAME, 'rb') as fin: + actual = [line.rstrip() for line in fin] + self.assertEqual(expected, actual) + + def test_nonexisting_bucket(self): + expected = u"выйду ночью в поле с конём".encode('utf-8') + with self.assertRaises(google.api_core.exceptions.NotFound): + with smart_open.gcs.open('thisbucketdoesntexist', 'mykey', 'wb') as fout: + fout.write(expected) + + def test_read_nonexisting_key(self): + with self.assertRaises(AttributeError): + with smart_open.gcs.open(BUCKET_NAME, 'my_nonexisting_key', 'rb') as fin: + fin.read() + + def 
test_double_close(self): + text = u'там за туманами, вечными, пьяными'.encode('utf-8') + fout = smart_open.gcs.open(BUCKET_NAME, 'key', 'wb') + fout.write(text) + fout.close() + fout.close() + + def test_flush_close(self): + text = u'там за туманами, вечными, пьяными'.encode('utf-8') + fout = smart_open.gcs.open(BUCKET_NAME, 'key', 'wb') + fout.write(text) + fout.flush() + fout.close() + + def test_terminate(self): + #TODO: Write this test + pass + + +class OpenTest(unittest.TestCase): + def setUp(self): + ignore_resource_warnings() + + def tearDown(self): + cleanup_bucket() + + def test_read_never_returns_none(self): + """read should never return None.""" + test_string = u"ветер по морю гуляет..." + with smart_open.gcs.open(BUCKET_NAME, BLOB_NAME, "wb") as fout: + fout.write(test_string.encode('utf8')) + + r = smart_open.gcs.open(BUCKET_NAME, BLOB_NAME, "rb") + self.assertEqual(r.read(), test_string.encode("utf-8")) + self.assertEqual(r.read(), b"") + self.assertEqual(r.read(), b"") + + +class ClampTest(unittest.TestCase): + def test(self): + self.assertEqual(smart_open.gcs.clamp(5, 0, 10), 5) + self.assertEqual(smart_open.gcs.clamp(11, 0, 10), 10) + self.assertEqual(smart_open.gcs.clamp(-1, 0, 10), 0) + + +if __name__ == '__main__': + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + unittest.main() From 45e1019caa66cd8768725a77d868c14ea6f3784d Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Mon, 11 Nov 2019 20:44:15 -0500 Subject: [PATCH 02/84] Start updating setup.py and README --- README.rst | 12 +++++++++++- setup.py | 5 +++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 68968074..7b016822 100644 --- a/README.rst +++ b/README.rst @@ -12,7 +12,7 @@ smart_open — utils for streaming large files in Python What? 
===== -``smart_open`` is a Python 2 & Python 3 library for **efficient streaming of very large files** from/to storages such as S3, HDFS, WebHDFS, HTTP, HTTPS, SFTP, or local filesystem. It supports transparent, on-the-fly (de-)compression for a variety of different formats. +``smart_open`` is a Python 2 & Python 3 library for **efficient streaming of very large files** from/to storages such as S3, GCS, HDFS, WebHDFS, HTTP, HTTPS, SFTP, or local filesystem. It supports transparent, on-the-fly (de-)compression for a variety of different formats. ``smart_open`` is a drop-in replacement for Python's built-in ``open()``: it can do anything ``open`` can (100% compatible, falls back to native ``open`` wherever possible), plus lots of nifty extra stuff on top. @@ -80,6 +80,7 @@ Other examples of URLs that ``smart_open`` accepts:: s3://my_bucket/my_key s3://my_key:my_secret@my_bucket/my_key s3://my_key:my_secret@my_server:my_port@my_bucket/my_key + gs://my_bucket/my_blob hdfs:///path/file hdfs://path/file webhdfs://host:port/path/file @@ -171,6 +172,14 @@ More examples with open('s3://bucket/key.txt', 'wb', transport_params=transport_params) as fout: fout.write(b'here we stand') + # stream from GCS + for line in open('gs://my_bucket/my_file.txt'): + print(line) + + # stream content *into* GCS (write mode): + with open('gs://my_bucket/my_file.txt') as fout: + fout.write(b'hello world') + Supported Compression Formats ----------------------------- @@ -209,6 +218,7 @@ Transport-specific Options - HTTP, HTTPS (read-only) - SSH, SCP and SFTP - WebHDFS +- GCS Each option involves setting up its own set of parameters. For example, for accessing S3, you often need to set up authentication, like API keys or a profile name. 
diff --git a/setup.py b/setup.py index 7fd3d7df..4baff204 100644 --- a/setup.py +++ b/setup.py @@ -47,6 +47,7 @@ def _get_version(): 'boto >= 2.32', 'requests', 'boto3', + 'google-cloud-storage', ] if sys.version_info[0] == 2: install_requires.append('bz2file') @@ -54,7 +55,7 @@ def _get_version(): setup( name='smart_open', version=_get_version(), - description='Utils for streaming large files (S3, HDFS, gzip, bz2...)', + description='Utils for streaming large files (S3, HDFS, GCS, gzip, bz2...)', long_description=read('README.rst'), packages=find_packages(), @@ -71,7 +72,7 @@ def _get_version(): url='https://github.com/piskvorky/smart_open', download_url='http://pypi.python.org/pypi/smart_open', - keywords='file streaming, s3, hdfs', + keywords='file streaming, s3, hdfs, gcs', license='MIT', platforms='any', From ae6c25280fda38f149cf0d909fc742816d6efdd0 Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Mon, 11 Nov 2019 20:45:15 -0500 Subject: [PATCH 03/84] Add .idea/ to .gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index cc526858..deb05a87 100644 --- a/.gitignore +++ b/.gitignore @@ -56,3 +56,6 @@ target/ # vim *.swp *.swo + +# PyCharm +.idea/ \ No newline at end of file From 38905aed490b6e5621d480a66895db33c0b04580 Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Tue, 12 Nov 2019 08:11:17 -0500 Subject: [PATCH 04/84] Move integration tests to their proper location --- integration-tests/test_gcs.py | 414 +++++++++++++++++++++++++++ smart_open/tests/test_gcs.py | 510 ++++++++-------------------------- 2 files changed, 532 insertions(+), 392 deletions(-) create mode 100644 integration-tests/test_gcs.py diff --git a/integration-tests/test_gcs.py b/integration-tests/test_gcs.py new file mode 100644 index 00000000..cc2edf3a --- /dev/null +++ b/integration-tests/test_gcs.py @@ -0,0 +1,414 @@ +import gzip +import io +import logging +import time +import uuid +import unittest +import warnings + +import 
google.cloud +import google.api_core.exceptions +import six + +import smart_open + +BUCKET_NAME = 'test-smartopen-{}'.format(uuid.uuid4().hex) +BLOB_NAME = 'test-blob' +WRITE_BLOB_NAME = 'test-write-blob' + +logger = logging.getLogger(__name__) +storage_client = google.cloud.storage.Client() + + +def setUpModule(): + '''Called once by unittest when initializing this module. Sets up the + test GCS bucket. + + ''' + storage_client.create_bucket(BUCKET_NAME) + + +def tearDownModule(): + '''Called once by unittest when tearing down this module. Empties and + removes the test GCS bucket. + + ''' + try: + bucket = get_bucket() + bucket.delete() + except google.cloud.exceptions.NotFound: + pass + + +def get_bucket(): + return storage_client.bucket(BUCKET_NAME) + + +def get_blob(): + bucket = get_bucket() + return bucket.blob(BLOB_NAME) + + +def cleanup_bucket(): + bucket = get_bucket() + + blobs = bucket.list_blobs() + for blob in blobs: + blob.delete() + + +def put_to_bucket(contents, num_attempts=12, sleep_time=5): + logger.debug('%r', locals()) + + # + # In real life, it can take a few seconds for the bucket to become ready. + # If we try to write to the key while the bucket isn't ready, we + will get a StorageError: NotFound. 
+ # + for attempt in range(num_attempts): + try: + blob = get_blob() + blob.upload_from_string(contents) + return + except google.cloud.exceptions.NotFound as err: + logger.error('caught %r, retrying', err) + time.sleep(sleep_time) + + assert False, 'failed to create bucket %s after %d attempts' % (BUCKET_NAME, num_attempts) + + +def ignore_resource_warnings(): + if six.PY2: + return + warnings.filterwarnings("ignore", category=ResourceWarning, message="unclosed.*") + + +class SeekableBufferedInputBaseTest(unittest.TestCase): + def setUp(self): + # lower the multipart upload size, to speed up these tests + self.old_min_buffer_size = smart_open.gcs.DEFAULT_BUFFER_SIZE + smart_open.gcs.DEFAULT_BUFFER_SIZE = 5 * 1024**2 + + ignore_resource_warnings() + + def tearDown(self): + cleanup_bucket() + + def test_iter(self): + """Are GCS files iterated over correctly?""" + # a list of strings to test with + expected = u"hello wořld\nhow are you?".encode('utf8') + put_to_bucket(contents=expected) + + # connect to fake GCS and read from the fake key we filled above + fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) + output = [line.rstrip(b'\n') for line in fin] + self.assertEqual(output, expected.split(b'\n')) + + def test_iter_context_manager(self): + # same thing but using a context manager + expected = u"hello wořld\nhow are you?".encode('utf8') + put_to_bucket(contents=expected) + with smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) as fin: + output = [line.rstrip(b'\n') for line in fin] + self.assertEqual(output, expected.split(b'\n')) + + def test_read(self): + """Are GCS files read correctly?""" + content = u"hello wořld\nhow are you?".encode('utf8') + put_to_bucket(contents=content) + logger.debug('content: %r len: %r', content, len(content)) + + fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) + self.assertEqual(content[:6], fin.read(6)) + self.assertEqual(content[6:14], fin.read(8)) # ř is 2 bytes + 
self.assertEqual(content[14:], fin.read()) # read the rest + + def test_seek_beginning(self): + """Does seeking to the beginning of GCS files work correctly?""" + content = u"hello wořld\nhow are you?".encode('utf8') + put_to_bucket(contents=content) + + fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) + self.assertEqual(content[:6], fin.read(6)) + self.assertEqual(content[6:14], fin.read(8)) # ř is 2 bytes + + fin.seek(0) + self.assertEqual(content, fin.read()) # no size given => read whole file + + fin.seek(0) + self.assertEqual(content, fin.read(-1)) # same thing + + def test_seek_start(self): + """Does seeking from the start of GCS files work correctly?""" + content = u"hello wořld\nhow are you?".encode('utf8') + put_to_bucket(contents=content) + + fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) + seek = fin.seek(6) + self.assertEqual(seek, 6) + self.assertEqual(fin.tell(), 6) + self.assertEqual(fin.read(6), u'wořld'.encode('utf-8')) + + def test_seek_current(self): + """Does seeking from the middle of GCS files work correctly?""" + content = u"hello wořld\nhow are you?".encode('utf8') + put_to_bucket(contents=content) + + fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) + self.assertEqual(fin.read(5), b'hello') + seek = fin.seek(1, whence=smart_open.gcs.CURRENT) + self.assertEqual(seek, 6) + self.assertEqual(fin.read(6), u'wořld'.encode('utf-8')) + + def test_seek_end(self): + """Does seeking from the end of GCS files work correctly?""" + content = u"hello wořld\nhow are you?".encode('utf8') + put_to_bucket(contents=content) + + fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) + seek = fin.seek(-4, whence=smart_open.gcs.END) + self.assertEqual(seek, len(content) - 4) + self.assertEqual(fin.read(), b'you?') + + def test_detect_eof(self): + content = u"hello wořld\nhow are you?".encode('utf8') + put_to_bucket(contents=content) + + fin = 
smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) + fin.read() + eof = fin.tell() + self.assertEqual(eof, len(content)) + fin.seek(0, whence=smart_open.gcs.END) + self.assertEqual(eof, fin.tell()) + + def test_read_gzip(self): + expected = u'раcцветали яблони и груши, поплыли туманы над рекой...'.encode('utf-8') + buf = io.BytesIO() + buf.close = lambda: None # keep buffer open so that we can .getvalue() + with gzip.GzipFile(fileobj=buf, mode='w') as zipfile: + zipfile.write(expected) + put_to_bucket(contents=buf.getvalue()) + + # + # Make sure we're reading things correctly. + # + with smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) as fin: + self.assertEqual(fin.read(), buf.getvalue()) + + # + # Make sure the buffer we wrote is legitimate gzip. + # + sanity_buf = io.BytesIO(buf.getvalue()) + with gzip.GzipFile(fileobj=sanity_buf) as zipfile: + self.assertEqual(zipfile.read(), expected) + + logger.debug('starting actual test') + with smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) as fin: + with gzip.GzipFile(fileobj=fin) as zipfile: + actual = zipfile.read() + + self.assertEqual(expected, actual) + + def test_readline(self): + content = b'englishman\nin\nnew\nyork\n' + put_to_bucket(contents=content) + + with smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) as fin: + fin.readline() + self.assertEqual(fin.tell(), content.index(b'\n')+1) + + fin.seek(0) + actual = list(fin) + self.assertEqual(fin.tell(), len(content)) + + expected = [b'englishman\n', b'in\n', b'new\n', b'york\n'] + self.assertEqual(expected, actual) + + def test_readline_tiny_buffer(self): + content = b'englishman\nin\nnew\nyork\n' + put_to_bucket(contents=content) + + with smart_open.gcs.BufferedInputBase(BUCKET_NAME, BLOB_NAME, buffer_size=8) as fin: + actual = list(fin) + + expected = [b'englishman\n', b'in\n', b'new\n', b'york\n'] + self.assertEqual(expected, actual) + + def test_read0_does_not_return_data(self): + content = 
b'englishman\nin\nnew\nyork\n' + put_to_bucket(contents=content) + + with smart_open.gcs.BufferedInputBase(BUCKET_NAME, BLOB_NAME) as fin: + data = fin.read(0) + + self.assertEqual(data, b'') + + +class BufferedOutputBaseTest(unittest.TestCase): + """ + Test writing into GCS files. + + """ + def setUp(self): + ignore_resource_warnings() + + def tearDown(self): + cleanup_bucket() + + def test_write_01(self): + """Does writing into GCS work correctly?""" + test_string = u"žluťoučký koníček".encode('utf8') + + with smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) as fout: + fout.write(test_string) + + output = list(smart_open.open("gs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME), "rb")) + + self.assertEqual(output, [test_string]) + + def test_write_01a(self): + """Does gcs write fail on incorrect input?""" + try: + with smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) as fin: + fin.write(None) + except TypeError: + pass + else: + self.fail() + + def test_write_02(self): + """Does gcs write unicode-utf8 conversion work?""" + smart_open_write = smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) + smart_open_write.tell() + logger.info("smart_open_write: %r", smart_open_write) + with smart_open_write as fout: + fout.write(u"testžížáč".encode("utf-8")) + self.assertEqual(fout.tell(), 14) + + def test_write_03(self): + """Does gcs multipart chunking work correctly?""" + # write + smart_open_write = smart_open.gcs.BufferedOutputBase( + BUCKET_NAME, WRITE_BLOB_NAME, min_part_size=10 + ) + with smart_open_write as fout: + fout.write(b"test") + self.assertEqual(fout._buf.tell(), 4) + + fout.write(b"test\n") + self.assertEqual(fout._buf.tell(), 9) + self.assertEqual(fout._total_parts, 0) + + fout.write(b"test") + self.assertEqual(fout._buf.tell(), 0) + self.assertEqual(fout._total_parts, 1) + + # read back the same key and check its content + output = list(smart_open.open("gcs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME))) + 
self.assertEqual(output, ["testtest\n", "test"]) + + def test_write_04(self): + """Does writing no data cause key with an empty value to be created?""" + smart_open_write = smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) + with smart_open_write as fout: # noqa + pass + + # read back the same key and check its content + output = list(smart_open.open("gcs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME))) + + self.assertEqual(output, []) + + def test_gzip(self): + expected = u'а не спеть ли мне песню... о любви'.encode('utf-8') + with smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) as fout: + with gzip.GzipFile(fileobj=fout, mode='w') as zipfile: + zipfile.write(expected) + + with smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, WRITE_BLOB_NAME) as fin: + with gzip.GzipFile(fileobj=fin) as zipfile: + actual = zipfile.read() + + self.assertEqual(expected, actual) + + def test_buffered_writer_wrapper_works(self): + """ + Ensure that we can wrap a smart_open gcs stream in a BufferedWriter, which + passes a memoryview object to the underlying stream in python >= 2.7 + """ + expected = u'не думай о секундах свысока' + + with smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) as fout: + with io.BufferedWriter(fout) as sub_out: + sub_out.write(expected.encode('utf-8')) + + with smart_open.smart_open("gcs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME)) as fin: + with io.TextIOWrapper(fin, encoding='utf-8') as text: + actual = text.read() + + self.assertEqual(expected, actual) + + def test_binary_iterator(self): + expected = u"выйду ночью в поле с конём".encode('utf-8').split(b' ') + put_to_bucket(contents=b"\n".join(expected)) + with smart_open.gcs.open(BUCKET_NAME, BLOB_NAME, 'rb') as fin: + actual = [line.rstrip() for line in fin] + self.assertEqual(expected, actual) + + def test_nonexisting_bucket(self): + expected = u"выйду ночью в поле с конём".encode('utf-8') + with self.assertRaises(google.api_core.exceptions.NotFound): + 
with smart_open.gcs.open('thisbucketdoesntexist', 'mykey', 'wb') as fout: + fout.write(expected) + + def test_read_nonexisting_key(self): + with self.assertRaises(AttributeError): + with smart_open.gcs.open(BUCKET_NAME, 'my_nonexisting_key', 'rb') as fin: + fin.read() + + def test_double_close(self): + text = u'там за туманами, вечными, пьяными'.encode('utf-8') + fout = smart_open.gcs.open(BUCKET_NAME, 'key', 'wb') + fout.write(text) + fout.close() + fout.close() + + def test_flush_close(self): + text = u'там за туманами, вечными, пьяными'.encode('utf-8') + fout = smart_open.gcs.open(BUCKET_NAME, 'key', 'wb') + fout.write(text) + fout.flush() + fout.close() + + def test_terminate(self): + text = u'там за туманами, вечными, пьяными'.encode('utf-8') + fout = smart_open.gcs.open(BUCKET_NAME, 'key', 'wb') + fout.write(text) + fout.terminate() + + with self.assertRaises(AttributeError): + with smart_open.gcs.open(BUCKET_NAME, 'key', 'rb') as fin: + fin.read() + + +class OpenTest(unittest.TestCase): + def setUp(self): + ignore_resource_warnings() + + def tearDown(self): + cleanup_bucket() + + def test_read_never_returns_none(self): + """read should never return None.""" + test_string = u"ветер по морю гуляет..." 
+ with smart_open.gcs.open(BUCKET_NAME, BLOB_NAME, "wb") as fout: + fout.write(test_string.encode('utf8')) + + r = smart_open.gcs.open(BUCKET_NAME, BLOB_NAME, "rb") + self.assertEqual(r.read(), test_string.encode("utf-8")) + self.assertEqual(r.read(), b"") + self.assertEqual(r.read(), b"") + +if __name__ == '__main__': + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + unittest.main() diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index 5142fd2c..76e1a9c4 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -1,409 +1,125 @@ -import uuid import logging -import time -import warnings import unittest -import io -import gzip - -import google.cloud -import google.api_core.exceptions -import six import smart_open -BUCKET_NAME = 'test-smartopen-{}'.format(uuid.uuid4().hex) -BLOB_NAME = 'test-blob' -WRITE_BLOB_NAME = 'test-write-blob' - +BYTES = b'i tried so hard and got so far but in the end it doesn\'t even matter' +HEADERS = { + 'Content-Length': str(len(BYTES)), +} logger = logging.getLogger(__name__) -storage_client = google.cloud.storage.Client() - - -def setUpModule(): - '''Called once by unittest when initializing this module. Sets up the - test GCS bucket. - - ''' - storage_client.create_bucket(BUCKET_NAME) - - -def tearDownModule(): - '''Called once by unittest when tearing down this module. Empties and - removes the test GCS bucket. 
- - ''' - try: - bucket = get_bucket() - bucket.delete() - except google.cloud.exceptions.NotFound: - pass - - -def get_bucket(): - return storage_client.bucket(BUCKET_NAME) - - -def get_blob(): - bucket = get_bucket() - return bucket.blob(BLOB_NAME) - - -def cleanup_bucket(): - bucket = get_bucket() - - blobs = bucket.list_blobs() - for blob in blobs: - blob.delete() - - -def put_to_bucket(contents, num_attempts=12, sleep_time=5): - # fake (or not) connection, bucket and key - logger.debug('%r', locals()) - - # - # In real life, it can take a few seconds for the bucket to become ready. - # If we try to write to the key while the bucket while it isn't ready, we - # will get a StorageError: NotFound. - # - for attempt in range(num_attempts): - try: - blob = get_blob() - blob.upload_from_string(contents) - return - except google.cloud.exceptions.NotFound as err: - logger.error('caught %r, retrying', err) - time.sleep(sleep_time) - - assert False, 'failed to create bucket %s after %d attempts' % (BUCKET_NAME, num_attempts) - - -def ignore_resource_warnings(): - if six.PY2: - return - warnings.filterwarnings("ignore", category=ResourceWarning, message="unclosed.*") - - -class SeekableBufferedInputBaseTest(unittest.TestCase): - def setUp(self): - # lower the multipart upload size, to speed up these tests - self.old_min_buffer_size = smart_open.gcs.DEFAULT_BUFFER_SIZE - smart_open.gcs.DEFAULT_BUFFER_SIZE = 5 * 1024**2 - - ignore_resource_warnings() - - def tearDown(self): - cleanup_bucket() - - def test_iter(self): - """Are GCS files iterated over correctly?""" - # a list of strings to test with - expected = u"hello wořld\nhow are you?".encode('utf8') - put_to_bucket(contents=expected) - - # connect to fake GCS and read from the fake key we filled above - fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) - output = [line.rstrip(b'\n') for line in fin] - self.assertEqual(output, expected.split(b'\n')) - - def test_iter_context_manager(self): - # same 
thing but using a context manager - expected = u"hello wořld\nhow are you?".encode('utf8') - put_to_bucket(contents=expected) - with smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) as fin: - output = [line.rstrip(b'\n') for line in fin] - self.assertEqual(output, expected.split(b'\n')) - - def test_read(self): - """Are GCS files read correctly?""" - content = u"hello wořld\nhow are you?".encode('utf8') - put_to_bucket(contents=content) - logger.debug('content: %r len: %r', content, len(content)) - - fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) - self.assertEqual(content[:6], fin.read(6)) - self.assertEqual(content[6:14], fin.read(8)) # ř is 2 bytes - self.assertEqual(content[14:], fin.read()) # read the rest - - def test_seek_beginning(self): - """Does seeking to the beginning of GCS files work correctly?""" - content = u"hello wořld\nhow are you?".encode('utf8') - put_to_bucket(contents=content) - - fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) - self.assertEqual(content[:6], fin.read(6)) - self.assertEqual(content[6:14], fin.read(8)) # ř is 2 bytes - - fin.seek(0) - self.assertEqual(content, fin.read()) # no size given => read whole file - - fin.seek(0) - self.assertEqual(content, fin.read(-1)) # same thing - - def test_seek_start(self): - """Does seeking from the start of GCS files work correctly?""" - content = u"hello wořld\nhow are you?".encode('utf8') - put_to_bucket(contents=content) - - fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) - seek = fin.seek(6) - self.assertEqual(seek, 6) - self.assertEqual(fin.tell(), 6) - self.assertEqual(fin.read(6), u'wořld'.encode('utf-8')) - - def test_seek_current(self): - """Does seeking from the middle of GCS files work correctly?""" - content = u"hello wořld\nhow are you?".encode('utf8') - put_to_bucket(contents=content) - - fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) - self.assertEqual(fin.read(5), 
b'hello') - seek = fin.seek(1, whence=smart_open.gcs.CURRENT) - self.assertEqual(seek, 6) - self.assertEqual(fin.read(6), u'wořld'.encode('utf-8')) - - def test_seek_end(self): - """Does seeking from the end of GCS files work correctly?""" - content = u"hello wořld\nhow are you?".encode('utf8') - put_to_bucket(contents=content) - - fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) - seek = fin.seek(-4, whence=smart_open.gcs.END) - self.assertEqual(seek, len(content) - 4) - self.assertEqual(fin.read(), b'you?') - - def test_detect_eof(self): - content = u"hello wořld\nhow are you?".encode('utf8') - put_to_bucket(contents=content) - - fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) - fin.read() - eof = fin.tell() - self.assertEqual(eof, len(content)) - fin.seek(0, whence=smart_open.gcs.END) - self.assertEqual(eof, fin.tell()) - - def test_read_gzip(self): - expected = u'раcцветали яблони и груши, поплыли туманы над рекой...'.encode('utf-8') - buf = io.BytesIO() - buf.close = lambda: None # keep buffer open so that we can .getvalue() - with gzip.GzipFile(fileobj=buf, mode='w') as zipfile: - zipfile.write(expected) - put_to_bucket(contents=buf.getvalue()) - - # - # Make sure we're reading things correctly. - # - with smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) as fin: - self.assertEqual(fin.read(), buf.getvalue()) - - # - # Make sure the buffer we wrote is legitimate gzip. 
- # - sanity_buf = io.BytesIO(buf.getvalue()) - with gzip.GzipFile(fileobj=sanity_buf) as zipfile: - self.assertEqual(zipfile.read(), expected) - logger.debug('starting actual test') - with smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) as fin: - with gzip.GzipFile(fileobj=fin) as zipfile: - actual = zipfile.read() - - self.assertEqual(expected, actual) - - def test_readline(self): - content = b'englishman\nin\nnew\nyork\n' - put_to_bucket(contents=content) - - with smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) as fin: - fin.readline() - self.assertEqual(fin.tell(), content.index(b'\n')+1) +class GCSTest(unittest.TestCase): + @responses.activate + def test_read_all(self): + responses.add(responses.GET, URL, body=BYTES, stream=True) + reader = smart_open.http.SeekableBufferedInputBase(URL) + read_bytes = reader.read() + self.assertEqual(BYTES, read_bytes) + + @responses.activate + def test_seek_from_start(self): + responses.add_callback(responses.GET, URL, callback=request_callback) + reader = smart_open.http.SeekableBufferedInputBase(URL) + + reader.seek(10) + self.assertEqual(reader.tell(), 10) + read_bytes = reader.read(size=10) + self.assertEqual(reader.tell(), 20) + self.assertEqual(BYTES[10:20], read_bytes) + + reader.seek(20) + read_bytes = reader.read(size=10) + self.assertEqual(BYTES[20:30], read_bytes) + + reader.seek(0) + read_bytes = reader.read(size=10) + self.assertEqual(BYTES[:10], read_bytes) + + @responses.activate + def test_seek_from_current(self): + responses.add_callback(responses.GET, URL, callback=request_callback) + reader = smart_open.http.SeekableBufferedInputBase(URL) + + reader.seek(10) + read_bytes = reader.read(size=10) + self.assertEqual(BYTES[10:20], read_bytes) + + self.assertEqual(reader.tell(), 20) + reader.seek(10, whence=smart_open.s3.CURRENT) + self.assertEqual(reader.tell(), 30) + read_bytes = reader.read(size=10) + self.assertEqual(reader.tell(), 40) + self.assertEqual(BYTES[30:40], 
read_bytes) + + @responses.activate + def test_seek_from_end(self): + responses.add_callback(responses.GET, URL, callback=request_callback) + reader = smart_open.http.SeekableBufferedInputBase(URL) + + reader.seek(-10, whence=smart_open.s3.END) + self.assertEqual(reader.tell(), len(BYTES) - 10) + read_bytes = reader.read(size=10) + self.assertEqual(reader.tell(), len(BYTES)) + self.assertEqual(BYTES[-10:], read_bytes) + + @responses.activate + def test_headers_are_as_assigned(self): + responses.add_callback(responses.GET, URL, callback=request_callback) + + # use default _HEADERS + x = smart_open.http.BufferedInputBase(URL) + # set different ones + x.headers['Accept-Encoding'] = 'compress, gzip' + x.headers['Other-Header'] = 'value' + + # use default again, global shouldn't be overwritten by x + y = smart_open.http.BufferedInputBase(URL) + # should be default headers + self.assertEqual(y.headers, {'Accept-Encoding': 'identity'}) + # should be assigned headers + self.assertEqual(x.headers, {'Accept-Encoding': 'compress, gzip', 'Other-Header': 'value'}) + + @responses.activate + def test_headers(self): + """Does the top-level http.open function handle headers correctly?""" + responses.add_callback(responses.GET, URL, callback=request_callback) + reader = smart_open.http.open(URL, 'rb', headers={'Foo': 'bar'}) + self.assertEqual(reader.headers['Foo'], 'bar') + + @responses.activate + def test_https_seek_start(self): + """Did the seek start over HTTPS work?""" + responses.add_callback(responses.GET, HTTPS_URL, callback=request_callback) + + with smart_open.open(HTTPS_URL, "rb") as fin: + read_bytes_1 = fin.read(size=10) fin.seek(0)
BLOB_NAME, buffer_size=8) as fin: - actual = list(fin) - - expected = [b'englishman\n', b'in\n', b'new\n', b'york\n'] - self.assertEqual(expected, actual) - - def test_read0_does_not_return_data(self): - content = b'englishman\nin\nnew\nyork\n' - put_to_bucket(contents=content) - - with smart_open.gcs.BufferedInputBase(BUCKET_NAME, BLOB_NAME) as fin: - data = fin.read(0) - - self.assertEqual(data, b'') - - -class BufferedOutputBaseTest(unittest.TestCase): - """ - Test writing into GCS files. - - """ - def setUp(self): - ignore_resource_warnings() - - def tearDown(self): - cleanup_bucket() - - def test_write_01(self): - """Does writing into GCS work correctly?""" - test_string = u"žluťoučký koníček".encode('utf8') - - with smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) as fout: - fout.write(test_string) - - output = list(smart_open.open("gs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME), "rb")) - - self.assertEqual(output, [test_string]) - - def test_write_01a(self): - """Does gcs write fail on incorrect input?""" - try: - with smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) as fin: - fin.write(None) - except TypeError: - pass - else: - self.fail() - - def test_write_02(self): - """Does gcs write unicode-utf8 conversion work?""" - smart_open_write = smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) - smart_open_write.tell() - logger.info("smart_open_write: %r", smart_open_write) - with smart_open_write as fout: - fout.write(u"testžížáč".encode("utf-8")) - self.assertEqual(fout.tell(), 14) - - def test_write_03(self): - """Does gcs multipart chunking work correctly?""" - # write - smart_open_write = smart_open.gcs.BufferedOutputBase( - BUCKET_NAME, WRITE_BLOB_NAME, min_part_size=10 - ) - with smart_open_write as fout: - fout.write(b"test") - self.assertEqual(fout._buf.tell(), 4) - - fout.write(b"test\n") - self.assertEqual(fout._buf.tell(), 9) - self.assertEqual(fout._total_parts, 0) + read_bytes_2 = fin.read(size=10) + 
self.assertEqual(read_bytes_1, read_bytes_2) - fout.write(b"test") - self.assertEqual(fout._buf.tell(), 0) - self.assertEqual(fout._total_parts, 1) + @responses.activate + def test_https_seek_forward(self): + """Did the seek forward over HTTPS work?""" + responses.add_callback(responses.GET, HTTPS_URL, callback=request_callback) - # read back the same key and check its content - output = list(smart_open.open("gcs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME))) - self.assertEqual(output, ["testtest\n", "test"]) + with smart_open.open(HTTPS_URL, "rb") as fin: + fin.seek(10) + read_bytes = fin.read(size=10) + self.assertEqual(BYTES[10:20], read_bytes) - def test_write_04(self): - """Does writing no data cause key with an empty value to be created?""" - smart_open_write = smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) - with smart_open_write as fout: # noqa - pass + @responses.activate + def test_https_seek_reverse(self): + """Did the seek in reverse over HTTPS work?""" + responses.add_callback(responses.GET, HTTPS_URL, callback=request_callback) - # read back the same key and check its content - output = list(smart_open.open("gcs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME))) - - self.assertEqual(output, []) - - def test_gzip(self): - expected = u'а не спеть ли мне песню... 
о любви'.encode('utf-8') - with smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) as fout: - with gzip.GzipFile(fileobj=fout, mode='w') as zipfile: - zipfile.write(expected) - - with smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, WRITE_BLOB_NAME) as fin: - with gzip.GzipFile(fileobj=fin) as zipfile: - actual = zipfile.read() - - self.assertEqual(expected, actual) - - def test_buffered_writer_wrapper_works(self): - """ - Ensure that we can wrap a smart_open gcs stream in a BufferedWriter, which - passes a memoryview object to the underlying stream in python >= 2.7 - """ - expected = u'не думай о секундах свысока' - - with smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) as fout: - with io.BufferedWriter(fout) as sub_out: - sub_out.write(expected.encode('utf-8')) - - with smart_open.smart_open("gcs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME)) as fin: - with io.TextIOWrapper(fin, encoding='utf-8') as text: - actual = text.read() - - self.assertEqual(expected, actual) - - def test_binary_iterator(self): - expected = u"выйду ночью в поле с конём".encode('utf-8').split(b' ') - put_to_bucket(contents=b"\n".join(expected)) - with smart_open.gcs.open(BUCKET_NAME, BLOB_NAME, 'rb') as fin: - actual = [line.rstrip() for line in fin] - self.assertEqual(expected, actual) - - def test_nonexisting_bucket(self): - expected = u"выйду ночью в поле с конём".encode('utf-8') - with self.assertRaises(google.api_core.exceptions.NotFound): - with smart_open.gcs.open('thisbucketdoesntexist', 'mykey', 'wb') as fout: - fout.write(expected) - - def test_read_nonexisting_key(self): - with self.assertRaises(AttributeError): - with smart_open.gcs.open(BUCKET_NAME, 'my_nonexisting_key', 'rb') as fin: - fin.read() - - def test_double_close(self): - text = u'там за туманами, вечными, пьяными'.encode('utf-8') - fout = smart_open.gcs.open(BUCKET_NAME, 'key', 'wb') - fout.write(text) - fout.close() - fout.close() - - def test_flush_close(self): - text = u'там за 
туманами, вечными, пьяными'.encode('utf-8') - fout = smart_open.gcs.open(BUCKET_NAME, 'key', 'wb') - fout.write(text) - fout.flush() - fout.close() - - def test_terminate(self): - #TODO: Write this test - pass - - -class OpenTest(unittest.TestCase): - def setUp(self): - ignore_resource_warnings() - - def tearDown(self): - cleanup_bucket() - - def test_read_never_returns_none(self): - """read should never return None.""" - test_string = u"ветер по морю гуляет..." - with smart_open.gcs.open(BUCKET_NAME, BLOB_NAME, "wb") as fout: - fout.write(test_string.encode('utf8')) - - r = smart_open.gcs.open(BUCKET_NAME, BLOB_NAME, "rb") - self.assertEqual(r.read(), test_string.encode("utf-8")) - self.assertEqual(r.read(), b"") - self.assertEqual(r.read(), b"") + with smart_open.open(HTTPS_URL, "rb") as fin: + read_bytes_1 = fin.read(size=10) + fin.seek(-10, whence=smart_open.s3.CURRENT) + read_bytes_2 = fin.read(size=10) + self.assertEqual(read_bytes_1, read_bytes_2) class ClampTest(unittest.TestCase): @@ -413,6 +129,16 @@ def test(self): self.assertEqual(smart_open.gcs.clamp(-1, 0, 10), 0) +class MakeRangeStringTest(unittest.TestCase): + def test_no_stop(self): + start, stop, end = 1, None, 2 + self.assertEqual(smart_open.gcs.make_range_string(start, stop), 'bytes 1-/*') + + def test_stop(self): + start, stop, end = 1, 2, smart_open.gcs._UNKNOWN_FILE_SIZE + self.assertEqual(smart_open.gcs.make_range_string(start, stop), 'bytes 1-2/*') + + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) unittest.main() From d054a8730483d51bf3787ae9b0614fb5cb49a34f Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Fri, 15 Nov 2019 08:46:02 -0500 Subject: [PATCH 05/84] All unit tests passing with mocks --- integration-tests/test_gcs.py | 414 ------------------- smart_open/gcs.py | 9 +- smart_open/tests/test_gcs.py | 737 ++++++++++++++++++++++++++++------ 3 files changed, 630 insertions(+), 530 deletions(-) diff --git 
a/integration-tests/test_gcs.py b/integration-tests/test_gcs.py index cc2edf3a..e69de29b 100644 --- a/integration-tests/test_gcs.py +++ b/integration-tests/test_gcs.py @@ -1,414 +0,0 @@ -import gzip -import io -import logging -import time -import uuid -import unittest -import warnings - -import google.cloud -import google.api_core.exceptions -import six - -import smart_open - -BUCKET_NAME = 'test-smartopen-{}'.format(uuid.uuid4().hex) -BLOB_NAME = 'test-blob' -WRITE_BLOB_NAME = 'test-write-blob' - -logger = logging.getLogger(__name__) -storage_client = google.cloud.storage.Client() - - -def setUpModule(): - '''Called once by unittest when initializing this module. Sets up the - test GCS bucket. - - ''' - storage_client.create_bucket(BUCKET_NAME) - - -def tearDownModule(): - '''Called once by unittest when tearing down this module. Empties and - removes the test GCS bucket. - - ''' - try: - bucket = get_bucket() - bucket.delete() - except google.cloud.exceptions.NotFound: - pass - - -def get_bucket(): - return storage_client.bucket(BUCKET_NAME) - - -def get_blob(): - bucket = get_bucket() - return bucket.blob(BLOB_NAME) - - -def cleanup_bucket(): - bucket = get_bucket() - - blobs = bucket.list_blobs() - for blob in blobs: - blob.delete() - - -def put_to_bucket(contents, num_attempts=12, sleep_time=5): - logger.debug('%r', locals()) - - # - # In real life, it can take a few seconds for the bucket to become ready. - # If we try to write to the key while the bucket while it isn't ready, we - # will get a StorageError: NotFound. 
- # - for attempt in range(num_attempts): - try: - blob = get_blob() - blob.upload_from_string(contents) - return - except google.cloud.exceptions.NotFound as err: - logger.error('caught %r, retrying', err) - time.sleep(sleep_time) - - assert False, 'failed to create bucket %s after %d attempts' % (BUCKET_NAME, num_attempts) - - -def ignore_resource_warnings(): - if six.PY2: - return - warnings.filterwarnings("ignore", category=ResourceWarning, message="unclosed.*") - - -class SeekableBufferedInputBaseTest(unittest.TestCase): - def setUp(self): - # lower the multipart upload size, to speed up these tests - self.old_min_buffer_size = smart_open.gcs.DEFAULT_BUFFER_SIZE - smart_open.gcs.DEFAULT_BUFFER_SIZE = 5 * 1024**2 - - ignore_resource_warnings() - - def tearDown(self): - cleanup_bucket() - - def test_iter(self): - """Are GCS files iterated over correctly?""" - # a list of strings to test with - expected = u"hello wořld\nhow are you?".encode('utf8') - put_to_bucket(contents=expected) - - # connect to fake GCS and read from the fake key we filled above - fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) - output = [line.rstrip(b'\n') for line in fin] - self.assertEqual(output, expected.split(b'\n')) - - def test_iter_context_manager(self): - # same thing but using a context manager - expected = u"hello wořld\nhow are you?".encode('utf8') - put_to_bucket(contents=expected) - with smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) as fin: - output = [line.rstrip(b'\n') for line in fin] - self.assertEqual(output, expected.split(b'\n')) - - def test_read(self): - """Are GCS files read correctly?""" - content = u"hello wořld\nhow are you?".encode('utf8') - put_to_bucket(contents=content) - logger.debug('content: %r len: %r', content, len(content)) - - fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) - self.assertEqual(content[:6], fin.read(6)) - self.assertEqual(content[6:14], fin.read(8)) # ř is 2 bytes - 
self.assertEqual(content[14:], fin.read()) # read the rest - - def test_seek_beginning(self): - """Does seeking to the beginning of GCS files work correctly?""" - content = u"hello wořld\nhow are you?".encode('utf8') - put_to_bucket(contents=content) - - fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) - self.assertEqual(content[:6], fin.read(6)) - self.assertEqual(content[6:14], fin.read(8)) # ř is 2 bytes - - fin.seek(0) - self.assertEqual(content, fin.read()) # no size given => read whole file - - fin.seek(0) - self.assertEqual(content, fin.read(-1)) # same thing - - def test_seek_start(self): - """Does seeking from the start of GCS files work correctly?""" - content = u"hello wořld\nhow are you?".encode('utf8') - put_to_bucket(contents=content) - - fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) - seek = fin.seek(6) - self.assertEqual(seek, 6) - self.assertEqual(fin.tell(), 6) - self.assertEqual(fin.read(6), u'wořld'.encode('utf-8')) - - def test_seek_current(self): - """Does seeking from the middle of GCS files work correctly?""" - content = u"hello wořld\nhow are you?".encode('utf8') - put_to_bucket(contents=content) - - fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) - self.assertEqual(fin.read(5), b'hello') - seek = fin.seek(1, whence=smart_open.gcs.CURRENT) - self.assertEqual(seek, 6) - self.assertEqual(fin.read(6), u'wořld'.encode('utf-8')) - - def test_seek_end(self): - """Does seeking from the end of GCS files work correctly?""" - content = u"hello wořld\nhow are you?".encode('utf8') - put_to_bucket(contents=content) - - fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) - seek = fin.seek(-4, whence=smart_open.gcs.END) - self.assertEqual(seek, len(content) - 4) - self.assertEqual(fin.read(), b'you?') - - def test_detect_eof(self): - content = u"hello wořld\nhow are you?".encode('utf8') - put_to_bucket(contents=content) - - fin = 
smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) - fin.read() - eof = fin.tell() - self.assertEqual(eof, len(content)) - fin.seek(0, whence=smart_open.gcs.END) - self.assertEqual(eof, fin.tell()) - - def test_read_gzip(self): - expected = u'раcцветали яблони и груши, поплыли туманы над рекой...'.encode('utf-8') - buf = io.BytesIO() - buf.close = lambda: None # keep buffer open so that we can .getvalue() - with gzip.GzipFile(fileobj=buf, mode='w') as zipfile: - zipfile.write(expected) - put_to_bucket(contents=buf.getvalue()) - - # - # Make sure we're reading things correctly. - # - with smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) as fin: - self.assertEqual(fin.read(), buf.getvalue()) - - # - # Make sure the buffer we wrote is legitimate gzip. - # - sanity_buf = io.BytesIO(buf.getvalue()) - with gzip.GzipFile(fileobj=sanity_buf) as zipfile: - self.assertEqual(zipfile.read(), expected) - - logger.debug('starting actual test') - with smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) as fin: - with gzip.GzipFile(fileobj=fin) as zipfile: - actual = zipfile.read() - - self.assertEqual(expected, actual) - - def test_readline(self): - content = b'englishman\nin\nnew\nyork\n' - put_to_bucket(contents=content) - - with smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) as fin: - fin.readline() - self.assertEqual(fin.tell(), content.index(b'\n')+1) - - fin.seek(0) - actual = list(fin) - self.assertEqual(fin.tell(), len(content)) - - expected = [b'englishman\n', b'in\n', b'new\n', b'york\n'] - self.assertEqual(expected, actual) - - def test_readline_tiny_buffer(self): - content = b'englishman\nin\nnew\nyork\n' - put_to_bucket(contents=content) - - with smart_open.gcs.BufferedInputBase(BUCKET_NAME, BLOB_NAME, buffer_size=8) as fin: - actual = list(fin) - - expected = [b'englishman\n', b'in\n', b'new\n', b'york\n'] - self.assertEqual(expected, actual) - - def test_read0_does_not_return_data(self): - content = 
b'englishman\nin\nnew\nyork\n' - put_to_bucket(contents=content) - - with smart_open.gcs.BufferedInputBase(BUCKET_NAME, BLOB_NAME) as fin: - data = fin.read(0) - - self.assertEqual(data, b'') - - -class BufferedOutputBaseTest(unittest.TestCase): - """ - Test writing into GCS files. - - """ - def setUp(self): - ignore_resource_warnings() - - def tearDown(self): - cleanup_bucket() - - def test_write_01(self): - """Does writing into GCS work correctly?""" - test_string = u"žluťoučký koníček".encode('utf8') - - with smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) as fout: - fout.write(test_string) - - output = list(smart_open.open("gs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME), "rb")) - - self.assertEqual(output, [test_string]) - - def test_write_01a(self): - """Does gcs write fail on incorrect input?""" - try: - with smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) as fin: - fin.write(None) - except TypeError: - pass - else: - self.fail() - - def test_write_02(self): - """Does gcs write unicode-utf8 conversion work?""" - smart_open_write = smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) - smart_open_write.tell() - logger.info("smart_open_write: %r", smart_open_write) - with smart_open_write as fout: - fout.write(u"testžížáč".encode("utf-8")) - self.assertEqual(fout.tell(), 14) - - def test_write_03(self): - """Does gcs multipart chunking work correctly?""" - # write - smart_open_write = smart_open.gcs.BufferedOutputBase( - BUCKET_NAME, WRITE_BLOB_NAME, min_part_size=10 - ) - with smart_open_write as fout: - fout.write(b"test") - self.assertEqual(fout._buf.tell(), 4) - - fout.write(b"test\n") - self.assertEqual(fout._buf.tell(), 9) - self.assertEqual(fout._total_parts, 0) - - fout.write(b"test") - self.assertEqual(fout._buf.tell(), 0) - self.assertEqual(fout._total_parts, 1) - - # read back the same key and check its content - output = list(smart_open.open("gcs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME))) - 
self.assertEqual(output, ["testtest\n", "test"]) - - def test_write_04(self): - """Does writing no data cause key with an empty value to be created?""" - smart_open_write = smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) - with smart_open_write as fout: # noqa - pass - - # read back the same key and check its content - output = list(smart_open.open("gcs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME))) - - self.assertEqual(output, []) - - def test_gzip(self): - expected = u'а не спеть ли мне песню... о любви'.encode('utf-8') - with smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) as fout: - with gzip.GzipFile(fileobj=fout, mode='w') as zipfile: - zipfile.write(expected) - - with smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, WRITE_BLOB_NAME) as fin: - with gzip.GzipFile(fileobj=fin) as zipfile: - actual = zipfile.read() - - self.assertEqual(expected, actual) - - def test_buffered_writer_wrapper_works(self): - """ - Ensure that we can wrap a smart_open gcs stream in a BufferedWriter, which - passes a memoryview object to the underlying stream in python >= 2.7 - """ - expected = u'не думай о секундах свысока' - - with smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) as fout: - with io.BufferedWriter(fout) as sub_out: - sub_out.write(expected.encode('utf-8')) - - with smart_open.smart_open("gcs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME)) as fin: - with io.TextIOWrapper(fin, encoding='utf-8') as text: - actual = text.read() - - self.assertEqual(expected, actual) - - def test_binary_iterator(self): - expected = u"выйду ночью в поле с конём".encode('utf-8').split(b' ') - put_to_bucket(contents=b"\n".join(expected)) - with smart_open.gcs.open(BUCKET_NAME, BLOB_NAME, 'rb') as fin: - actual = [line.rstrip() for line in fin] - self.assertEqual(expected, actual) - - def test_nonexisting_bucket(self): - expected = u"выйду ночью в поле с конём".encode('utf-8') - with self.assertRaises(google.api_core.exceptions.NotFound): - 
with smart_open.gcs.open('thisbucketdoesntexist', 'mykey', 'wb') as fout: - fout.write(expected) - - def test_read_nonexisting_key(self): - with self.assertRaises(AttributeError): - with smart_open.gcs.open(BUCKET_NAME, 'my_nonexisting_key', 'rb') as fin: - fin.read() - - def test_double_close(self): - text = u'там за туманами, вечными, пьяными'.encode('utf-8') - fout = smart_open.gcs.open(BUCKET_NAME, 'key', 'wb') - fout.write(text) - fout.close() - fout.close() - - def test_flush_close(self): - text = u'там за туманами, вечными, пьяными'.encode('utf-8') - fout = smart_open.gcs.open(BUCKET_NAME, 'key', 'wb') - fout.write(text) - fout.flush() - fout.close() - - def test_terminate(self): - text = u'там за туманами, вечными, пьяными'.encode('utf-8') - fout = smart_open.gcs.open(BUCKET_NAME, 'key', 'wb') - fout.write(text) - fout.terminate() - - with self.assertRaises(AttributeError): - with smart_open.gcs.open(BUCKET_NAME, 'key', 'rb') as fin: - fin.read() - - -class OpenTest(unittest.TestCase): - def setUp(self): - ignore_resource_warnings() - - def tearDown(self): - cleanup_bucket() - - def test_read_never_returns_none(self): - """read should never return None.""" - test_string = u"ветер по морю гуляет..." 
- with smart_open.gcs.open(BUCKET_NAME, BLOB_NAME, "wb") as fout: - fout.write(test_string.encode('utf8')) - - r = smart_open.gcs.open(BUCKET_NAME, BLOB_NAME, "rb") - self.assertEqual(r.read(), test_string.encode("utf-8")) - self.assertEqual(r.read(), b"") - self.assertEqual(r.read(), b"") - -if __name__ == '__main__': - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) - unittest.main() diff --git a/smart_open/gcs.py b/smart_open/gcs.py index b79f0680..89aca20d 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -5,7 +5,7 @@ import logging import sys -from google.cloud import storage +from google.cloud import storage, exceptions import google.auth.transport.requests as google_requests import six @@ -16,10 +16,10 @@ READ_BINARY = 'rb' WRITE_BINARY = 'wb' MODES = (READ_BINARY, WRITE_BINARY) -"""Allowed I/O modes for working with S3.""" +"""Allowed I/O modes for working with GCS.""" _BINARY_TYPES = (six.binary_type, bytearray) -"""Allowed binary buffer types for writing to the underlying S3 stream""" +"""Allowed binary buffer types for writing to the underlying GCS stream""" if sys.version_info >= (2, 7): _BINARY_TYPES = (six.binary_type, bytearray, memoryview) @@ -312,7 +312,8 @@ def __init__( bucket = client.get_bucket(bucket) self._blob = bucket.get_blob(key) - assert self._blob.exists(), 'Blob %s does not exist!' 
% key + if self._blob is None: + raise exceptions.NotFound('blob {} not found in {}'.format(key, bucket)) self._size = self._blob.size if self._blob.size is not None else 0 self._raw_reader = SeekableRawReader(self._blob, self._size) diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index 76e1a9c4..5d5cd02e 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -1,125 +1,638 @@ +import gzip +import inspect +import io import logging +import os +import time +import uuid import unittest +from unittest import mock +import warnings + +import google.cloud +import google.api_core.exceptions +import six import smart_open -BYTES = b'i tried so hard and got so far but in the end it doesn\'t even matter' -HEADERS = { - 'Content-Length': str(len(BYTES)), -} +BUCKET_NAME = 'test-smartopen-{}'.format(uuid.uuid4().hex) +BLOB_NAME = 'test-blob' +WRITE_BLOB_NAME = 'test-write-blob' +DISABLE_MOCKS = os.environ.get('SO_DISABLE_MOCKS') == "1" logger = logging.getLogger(__name__) -class GCSTest(unittest.TestCase): - @responses.activate - def test_read_all(self): - responses.add(responses.GET, URL, body=BYTES, stream=True) - reader = smart_open.http.SeekableBufferedInputBase(URL) - read_bytes = reader.read() - self.assertEqual(BYTES, read_bytes) - - @responses.activate - def test_seek_from_start(self): - responses.add_callback(responses.GET, URL, callback=request_callback) - reader = smart_open.http.SeekableBufferedInputBase(URL) - - reader.seek(10) - self.assertEqual(reader.tell(), 10) - read_bytes = reader.read(size=10) - self.assertEqual(reader.tell(), 20) - self.assertEqual(BYTES[10:20], read_bytes) - - reader.seek(20) - read_bytes = reader.read(size=10) - self.assertEqual(BYTES[20:30], read_bytes) - - reader.seek(0) - read_bytes = reader.read(size=10) - self.assertEqual(BYTES[:10], read_bytes) - - @responses.activate - def test_seek_from_current(self): - responses.add_callback(responses.GET, URL, callback=request_callback) - reader = 
smart_open.http.SeekableBufferedInputBase(URL) - - reader.seek(10) - read_bytes = reader.read(size=10) - self.assertEqual(BYTES[10:20], read_bytes) - - self.assertEqual(reader.tell(), 20) - reader.seek(10, whence=smart_open.s3.CURRENT) - self.assertEqual(reader.tell(), 30) - read_bytes = reader.read(size=10) - self.assertEqual(reader.tell(), 40) - self.assertEqual(BYTES[30:40], read_bytes) - - @responses.activate - def test_seek_from_end(self): - responses.add_callback(responses.GET, URL, callback=request_callback) - reader = smart_open.http.SeekableBufferedInputBase(URL) - - reader.seek(-10, whence=smart_open.s3.END) - self.assertEqual(reader.tell(), len(BYTES) - 10) - read_bytes = reader.read(size=10) - self.assertEqual(reader.tell(), len(BYTES)) - self.assertEqual(BYTES[-10:], read_bytes) - - @responses.activate - def test_headers_are_as_assigned(self): - responses.add_callback(responses.GET, URL, callback=request_callback) - - # use default _HEADERS - x = smart_open.http.BufferedInputBase(URL) - # set different ones - x.headers['Accept-Encoding'] = 'compress, gzip' - x.headers['Other-Header'] = 'value' - - # use default again, global shoudn't overwritten from x - y = smart_open.http.BufferedInputBase(URL) - # should be default headers - self.assertEqual(y.headers, {'Accept-Encoding': 'identity'}) - # should be assigned headers - self.assertEqual(x.headers, {'Accept-Encoding': 'compress, gzip', 'Other-Header': 'value'}) - - @responses.activate - def test_headers(self): - """Does the top-level http.open function handle headers correctly?""" - responses.add_callback(responses.GET, URL, callback=request_callback) - reader = smart_open.http.open(URL, 'rb', headers={'Foo': 'bar'}) - self.assertEqual(reader.headers['Foo'], 'bar') - - @responses.activate - def test_https_seek_start(self): - """Did the seek start over HTTPS work?""" - responses.add_callback(responses.GET, HTTPS_URL, callback=request_callback) - - with smart_open.open(HTTPS_URL, "rb") as fin: - 
read_bytes_1 = fin.read(size=10) +def ignore_resource_warnings(): + if six.PY2: + return + warnings.filterwarnings("ignore", category=ResourceWarning, message="unclosed.*") + + +class FakeBucket(object): + def __init__(self, client, name=None): + self.client = client # type: FakeClient + self.name = name + self.blobs = [] + self.exists = True + + def blob(self, blob_id): + blob = next((x for x in self.blobs if x.name == blob_id), None) + if blob is None: + blob = FakeBlob(blob_id, self) + return blob + + def delete(self): + self.client._delete_bucket(self) + self._exists = False + + def exists(self): + return self._exists + + def get_blob(self, blob_id): + blob = next((x for x in self.blobs if x.name == blob_id), None) + if blob is None: + raise google.cloud.exceptions.NotFound('Blob {} not found'.format(blob_id)) + return blob + + def list_blobs(self): + return self.blobs + + def _delete_blob(self, blob): + self.blobs.remove(blob) + + +class FakeBlob(object): + RESUMABLE_SESSION_URI_TEMPLATE = ( + 'https://www.googleapis.com/upload/storage/v1/b/' + '{bucket}' + '/o?uploadType=resumable&upload_id=' + '{upload_id}' + ) + + def __init__(self, name, bucket, create=True): + self.name = name + self._bucket = bucket # type: FakeBucket + self._exists = False + self.__contents = io.BytesIO() + + if create: + self._create_if_not_exists() + + def create_resumable_upload_session(self): + resumeable_upload_url = self.RESUMABLE_SESSION_URI_TEMPLATE.format( + bucket=self._bucket.name, + upload_id=uuid.uuid4().hex, + ) + self._bucket.client._uploads[resumeable_upload_url] = self + return resumeable_upload_url + + def delete(self): + self._bucket._delete_blob(self) + self._created = False + + def download_as_string(self, start=None, end=None): + if start is None: + start = 0 + if end is None: + end = self.__contents.tell() + self.__contents.seek(start) + return self.__contents.read(end - start) + + def exists(self, client=None): + return self._exists + + def 
upload_from_string(self, str): + self.__contents.write(str) + + def write(self, data): + self.__contents.write(data) + + @property + def bucket(self): + return self._bucket + + @property + def size(self): + if self.__contents.tell() == 0: + return None + return self.__contents.tell() + + def _create_if_not_exists(self): + if self not in self._bucket.blobs: + self._bucket.blobs.append(self) + self._exists = True + + +class FakeCredentials(object): + def __init__(self, client): + self.client = client # type: FakeClient + + def before_request(self, *args, **kwargs): + pass + + +class FakeClient(object): + def __init__(self, credentials=None): + if credentials is None: + credentials = FakeCredentials(self) + self._credentials = credentials # type: FakeCredentials + self._uploads = {} + self.__buckets = [] + + def bucket(self, bucket_id): + bucket = next((x for x in self.__buckets if x.name == bucket_id), None) + if bucket is None: + raise google.cloud.exceptions.NotFound('Bucket {} not found'.format(bucket_id)) + return bucket + + def create_bucket(self, bucket_id): + bucket = FakeBucket(self, bucket_id) + self.__buckets.append(bucket) + return bucket + + def get_bucket(self, bucket_id): + return self.bucket(bucket_id) + + def _delete_bucket(self, bucket): + self.__buckets.remove(bucket) + + +class FakeBlobUpload(object): + def __init__(self, url, blob): + self.url = url + self.blob = blob # type: FakeBlob + + +class FakeResponse(object): + def __init__(self, status_code=200): + self.status_code = status_code + + +class FakeAuthorizedSession(object): + def __init__(self, credentials): + self._credentials = credentials # type: FakeCredentials + + def delete(self, upload_url): + blob = self._credentials.client._uploads[upload_url] + blob.delete() + self._credentials.client._uploads.pop(upload_url) + + def put(self, url, data=None, headers=None): + if data is not None: + self._credentials.client._uploads[url].write(data.read()) + return FakeResponse() + + def 
_blob_with_url(self, + url, + client # type: FakeClient + ): + return client._uploads.get(url) + + +if DISABLE_MOCKS: + storage_client = google.cloud.storage.Client() +else: + storage_client = FakeClient() + + +def get_bucket(): + return storage_client.bucket(BUCKET_NAME) + + +def get_blob(): + bucket = get_bucket() + return bucket.blob(BLOB_NAME) + + +def cleanup_bucket(): + bucket = get_bucket() + + blobs = bucket.list_blobs() + for blob in blobs: + blob.delete() + + +def put_to_bucket(contents, num_attempts=12, sleep_time=5): + logger.debug('%r', locals()) + + # + # In real life, it can take a few seconds for the bucket to become ready. + # If we try to write to the key while the bucket while it isn't ready, we + # will get a StorageError: NotFound. + # + for attempt in range(num_attempts): + try: + blob = get_blob() + blob.upload_from_string(contents) + return + except google.cloud.exceptions.NotFound as err: + logger.error('caught %r, retrying', err) + time.sleep(sleep_time) + + assert False, 'failed to create bucket %s after %d attempts' % (BUCKET_NAME, num_attempts) + + +def mock_gcs(func): + '''Mocks the function and provides additional required arguments.''' + def inner(*args, **kwargs): + with mock.patch( + 'google.cloud.storage.Client', + return_value=storage_client, + ) as mocked_client, \ + mock.patch( + 'smart_open.gcs.google_requests.AuthorizedSession', + return_value=FakeAuthorizedSession(storage_client._credentials), + ) as mocked_session: + assert callable(func), 'you didn\'t provide a function!' + try: # is it a method that needs a self arg and mock args? + self_arg = inspect.signature(func).self + func(self_arg, mocked_client, mocked_session, *args, **kwargs) + except AttributeError: + func(*args, **kwargs) + return inner + + +def maybe_mock_gcs(func): + if DISABLE_MOCKS: + return func + else: + return mock_gcs(func) + + +@maybe_mock_gcs +def setUpModule(): + '''Called once by unittest when initializing this module. 
Sets up the + test GCS bucket. + + ''' + storage_client.create_bucket(BUCKET_NAME) + + +@maybe_mock_gcs +def tearDownModule(): + '''Called once by unittest when tearing down this module. Empties and + removes the test GCS bucket. + + ''' + try: + bucket = get_bucket() + bucket.delete() + except google.cloud.exceptions.NotFound: + pass + + +class SeekableBufferedInputBaseTest(unittest.TestCase): + def setUp(self): + # lower the multipart upload size, to speed up these tests + self.old_min_buffer_size = smart_open.gcs.DEFAULT_BUFFER_SIZE + smart_open.gcs.DEFAULT_BUFFER_SIZE = 5 * 1024**2 + + ignore_resource_warnings() + + def tearDown(self): + cleanup_bucket() + + @maybe_mock_gcs + def test_iter(self, *args): + """Are GCS files iterated over correctly?""" + # a list of strings to test with + expected = u"hello wořld\nhow are you?".encode('utf8') + put_to_bucket(contents=expected) + + # connect to fake GCS and read from the fake key we filled above + fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) + output = [line.rstrip(b'\n') for line in fin] + self.assertEqual(output, expected.split(b'\n')) + + @maybe_mock_gcs + def test_iter_context_manager(self, *args): + # same thing but using a context manager + expected = u"hello wořld\nhow are you?".encode('utf8') + put_to_bucket(contents=expected) + with smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) as fin: + output = [line.rstrip(b'\n') for line in fin] + self.assertEqual(output, expected.split(b'\n')) + + @maybe_mock_gcs + def test_read(self, *args): + """Are GCS files read correctly?""" + content = u"hello wořld\nhow are you?".encode('utf8') + put_to_bucket(contents=content) + logger.debug('content: %r len: %r', content, len(content)) + + fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) + self.assertEqual(content[:6], fin.read(6)) + self.assertEqual(content[6:14], fin.read(8)) # ř is 2 bytes + self.assertEqual(content[14:], fin.read()) # read the rest + + 
@maybe_mock_gcs + def test_seek_beginning(self, *args): + """Does seeking to the beginning of GCS files work correctly?""" + content = u"hello wořld\nhow are you?".encode('utf8') + put_to_bucket(contents=content) + + fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) + self.assertEqual(content[:6], fin.read(6)) + self.assertEqual(content[6:14], fin.read(8)) # ř is 2 bytes + + fin.seek(0) + self.assertEqual(content, fin.read()) # no size given => read whole file + + fin.seek(0) + self.assertEqual(content, fin.read(-1)) # same thing + + @maybe_mock_gcs + def test_seek_start(self, *args): + """Does seeking from the start of GCS files work correctly?""" + content = u"hello wořld\nhow are you?".encode('utf8') + put_to_bucket(contents=content) + + fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) + seek = fin.seek(6) + self.assertEqual(seek, 6) + self.assertEqual(fin.tell(), 6) + self.assertEqual(fin.read(6), u'wořld'.encode('utf-8')) + + @maybe_mock_gcs + def test_seek_current(self, *args): + """Does seeking from the middle of GCS files work correctly?""" + content = u"hello wořld\nhow are you?".encode('utf8') + put_to_bucket(contents=content) + + fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) + self.assertEqual(fin.read(5), b'hello') + seek = fin.seek(1, whence=smart_open.gcs.CURRENT) + self.assertEqual(seek, 6) + self.assertEqual(fin.read(6), u'wořld'.encode('utf-8')) + + @maybe_mock_gcs + def test_seek_end(self, *args): + """Does seeking from the end of GCS files work correctly?""" + content = u"hello wořld\nhow are you?".encode('utf8') + put_to_bucket(contents=content) + + fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) + seek = fin.seek(-4, whence=smart_open.gcs.END) + self.assertEqual(seek, len(content) - 4) + self.assertEqual(fin.read(), b'you?') + + @maybe_mock_gcs + def test_detect_eof(self, *args): + content = u"hello wořld\nhow are you?".encode('utf8') + 
put_to_bucket(contents=content) + + fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) + fin.read() + eof = fin.tell() + self.assertEqual(eof, len(content)) + fin.seek(0, whence=smart_open.gcs.END) + self.assertEqual(eof, fin.tell()) + + @maybe_mock_gcs + def test_read_gzip(self, *args): + expected = u'раcцветали яблони и груши, поплыли туманы над рекой...'.encode('utf-8') + buf = io.BytesIO() + buf.close = lambda: None # keep buffer open so that we can .getvalue() + with gzip.GzipFile(fileobj=buf, mode='w') as zipfile: + zipfile.write(expected) + put_to_bucket(contents=buf.getvalue()) + + # + # Make sure we're reading things correctly. + # + with smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) as fin: + self.assertEqual(fin.read(), buf.getvalue()) + + # + # Make sure the buffer we wrote is legitimate gzip. + # + sanity_buf = io.BytesIO(buf.getvalue()) + with gzip.GzipFile(fileobj=sanity_buf) as zipfile: + self.assertEqual(zipfile.read(), expected) + + logger.debug('starting actual test') + with smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) as fin: + with gzip.GzipFile(fileobj=fin) as zipfile: + actual = zipfile.read() + + self.assertEqual(expected, actual) + + @maybe_mock_gcs + def test_readline(self, *args): + content = b'englishman\nin\nnew\nyork\n' + put_to_bucket(contents=content) + + with smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) as fin: + fin.readline() + self.assertEqual(fin.tell(), content.index(b'\n')+1) + fin.seek(0) - read_bytes_2 = fin.read(size=10) - self.assertEqual(read_bytes_1, read_bytes_2) - - @responses.activate - def test_https_seek_forward(self): - """Did the seek forward over HTTPS work?""" - responses.add_callback(responses.GET, HTTPS_URL, callback=request_callback) - - with smart_open.open(HTTPS_URL, "rb") as fin: - fin.seek(10) - read_bytes = fin.read(size=10) - self.assertEqual(BYTES[10:20], read_bytes) - - @responses.activate - def 
test_https_seek_reverse(self): - """Did the seek in reverse over HTTPS work?""" - responses.add_callback(responses.GET, HTTPS_URL, callback=request_callback) - - with smart_open.open(HTTPS_URL, "rb") as fin: - read_bytes_1 = fin.read(size=10) - fin.seek(-10, whence=smart_open.s3.CURRENT) - read_bytes_2 = fin.read(size=10) - self.assertEqual(read_bytes_1, read_bytes_2) + actual = list(fin) + self.assertEqual(fin.tell(), len(content)) + + expected = [b'englishman\n', b'in\n', b'new\n', b'york\n'] + self.assertEqual(expected, actual) + + @maybe_mock_gcs + def test_readline_tiny_buffer(self, *args): + content = b'englishman\nin\nnew\nyork\n' + put_to_bucket(contents=content) + + with smart_open.gcs.BufferedInputBase(BUCKET_NAME, BLOB_NAME, buffer_size=8) as fin: + actual = list(fin) + + expected = [b'englishman\n', b'in\n', b'new\n', b'york\n'] + self.assertEqual(expected, actual) + + @maybe_mock_gcs + def test_read0_does_not_return_data(self, *args): + content = b'englishman\nin\nnew\nyork\n' + put_to_bucket(contents=content) + + with smart_open.gcs.BufferedInputBase(BUCKET_NAME, BLOB_NAME) as fin: + data = fin.read(0) + + self.assertEqual(data, b'') + + +class BufferedOutputBaseTest(unittest.TestCase): + """ + Test writing into GCS files. 
+ + """ + def setUp(self): + ignore_resource_warnings() + + def tearDown(self): + cleanup_bucket() + + @maybe_mock_gcs + def test_write_01(self): + """Does writing into GCS work correctly?""" + test_string = u"žluťoučký koníček".encode('utf8') + + with smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) as fout: + fout.write(test_string) + + output = list(smart_open.open("gs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME), "rb")) + + self.assertEqual(output, [test_string]) + + @maybe_mock_gcs + def test_write_01a(self): + """Does gcs write fail on incorrect input?""" + try: + with smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) as fin: + fin.write(None) + except TypeError: + pass + else: + self.fail() + + @maybe_mock_gcs + def test_write_02(self): + """Does gcs write unicode-utf8 conversion work?""" + smart_open_write = smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) + smart_open_write.tell() + logger.info("smart_open_write: %r", smart_open_write) + with smart_open_write as fout: + fout.write(u"testžížáč".encode("utf-8")) + self.assertEqual(fout.tell(), 14) + + @maybe_mock_gcs + def test_write_03(self): + """Does gcs multipart chunking work correctly?""" + # write + smart_open_write = smart_open.gcs.BufferedOutputBase( + BUCKET_NAME, WRITE_BLOB_NAME, min_part_size=10 + ) + with smart_open_write as fout: + fout.write(b"test") + self.assertEqual(fout._buf.tell(), 4) + + fout.write(b"test\n") + self.assertEqual(fout._buf.tell(), 9) + self.assertEqual(fout._total_parts, 0) + + fout.write(b"test") + self.assertEqual(fout._buf.tell(), 0) + self.assertEqual(fout._total_parts, 1) + + # read back the same key and check its content + output = list(smart_open.open("gcs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME))) + self.assertEqual(output, ["testtest\n", "test"]) + + @maybe_mock_gcs + def test_write_04(self): + """Does writing no data cause key with an empty value to be created?""" + smart_open_write = 
smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) + with smart_open_write as fout: # noqa + pass + + # read back the same key and check its content + output = list(smart_open.open("gcs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME))) + + self.assertEqual(output, []) + + @maybe_mock_gcs + def test_gzip(self): + expected = u'а не спеть ли мне песню... о любви'.encode('utf-8') + with smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) as fout: + with gzip.GzipFile(fileobj=fout, mode='w') as zipfile: + zipfile.write(expected) + + with smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, WRITE_BLOB_NAME) as fin: + with gzip.GzipFile(fileobj=fin) as zipfile: + actual = zipfile.read() + + self.assertEqual(expected, actual) + + @maybe_mock_gcs + def test_buffered_writer_wrapper_works(self): + """ + Ensure that we can wrap a smart_open gcs stream in a BufferedWriter, which + passes a memoryview object to the underlying stream in python >= 2.7 + """ + expected = u'не думай о секундах свысока' + + with smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) as fout: + with io.BufferedWriter(fout) as sub_out: + sub_out.write(expected.encode('utf-8')) + + with smart_open.smart_open("gcs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME)) as fin: + with io.TextIOWrapper(fin, encoding='utf-8') as text: + actual = text.read() + + self.assertEqual(expected, actual) + + @maybe_mock_gcs + def test_binary_iterator(self): + expected = u"выйду ночью в поле с конём".encode('utf-8').split(b' ') + put_to_bucket(contents=b"\n".join(expected)) + with smart_open.gcs.open(BUCKET_NAME, BLOB_NAME, 'rb') as fin: + actual = [line.rstrip() for line in fin] + self.assertEqual(expected, actual) + + @maybe_mock_gcs + def test_nonexisting_bucket(self): + expected = u"выйду ночью в поле с конём".encode('utf-8') + with self.assertRaises(google.api_core.exceptions.NotFound): + with smart_open.gcs.open('thisbucketdoesntexist', 'mykey', 'wb') as fout: + fout.write(expected) + + 
@maybe_mock_gcs + def test_read_nonexisting_key(self): + with self.assertRaises(google.api_core.exceptions.NotFound): + with smart_open.gcs.open(BUCKET_NAME, 'my_nonexisting_key', 'rb') as fin: + fin.read() + + @maybe_mock_gcs + def test_double_close(self): + text = u'там за туманами, вечными, пьяными'.encode('utf-8') + fout = smart_open.gcs.open(BUCKET_NAME, 'key', 'wb') + fout.write(text) + fout.close() + fout.close() + + @maybe_mock_gcs + def test_flush_close(self): + text = u'там за туманами, вечными, пьяными'.encode('utf-8') + fout = smart_open.gcs.open(BUCKET_NAME, 'key', 'wb') + fout.write(text) + fout.flush() + fout.close() + + @maybe_mock_gcs + def test_terminate(self): + text = u'там за туманами, вечными, пьяными'.encode('utf-8') + fout = smart_open.gcs.open(BUCKET_NAME, 'key', 'wb') + fout.write(text) + fout.terminate() + + with self.assertRaises(google.api_core.exceptions.NotFound): + with smart_open.gcs.open(BUCKET_NAME, 'key', 'rb') as fin: + fin.read() + + +class OpenTest(unittest.TestCase): + def setUp(self): + ignore_resource_warnings() + + def tearDown(self): + cleanup_bucket() + + @maybe_mock_gcs + def test_read_never_returns_none(self): + """read should never return None.""" + test_string = u"ветер по морю гуляет..." 
+ with smart_open.gcs.open(BUCKET_NAME, BLOB_NAME, "wb") as fout: + fout.write(test_string.encode('utf8')) + + r = smart_open.gcs.open(BUCKET_NAME, BLOB_NAME, "rb") + self.assertEqual(r.read(), test_string.encode("utf-8")) + self.assertEqual(r.read(), b"") + self.assertEqual(r.read(), b"") class ClampTest(unittest.TestCase): From a5e2a6eabd868f9b0c1e36d0e96acdb5def7bf9f Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Fri, 15 Nov 2019 09:00:27 -0500 Subject: [PATCH 06/84] Add gcs integration tests --- integration-tests/test_gcs.py | 129 ++++++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) diff --git a/integration-tests/test_gcs.py b/integration-tests/test_gcs.py index e69de29b..20bdb3a8 100644 --- a/integration-tests/test_gcs.py +++ b/integration-tests/test_gcs.py @@ -0,0 +1,129 @@ +# -*- coding: utf-8 -*- +import io +import os + +from google.cloud import storage + +import smart_open + +_GCS_BUCKET = os.environ.get('SO_GCS_BUCKET') +_GCS_URL = 'gs://' + _GCS_BUCKET +assert _GCS_BUCKET is not None, 'please set the SO_GCS_BUCKET environment variable' + + +def initialize_bucket(): + storage_client = storage.Client() + bucket = storage_client.get_bucket(_GCS_BUCKET) + blobs = bucket.list_blobs() + for blob in blobs: + blob.delete() + + +def write_read(key, content, write_mode, read_mode, encoding=None, **kwargs): + with smart_open.smart_open(key, write_mode, encoding=encoding, **kwargs) as fout: + fout.write(content) + with smart_open.smart_open(key, read_mode, encoding=encoding, **kwargs) as fin: + actual = fin.read() + return actual + + +def read_length_prefixed_messages(key, read_mode, encoding=None, **kwargs): + with smart_open.smart_open(key, read_mode, encoding=encoding, **kwargs) as fin: + actual = b'' + length_byte = fin.read(1); + while len(length_byte): + actual += length_byte + msg = fin.read(ord(length_byte)) + actual += msg + length_byte = fin.read(1) + return actual + + +def test_gcs_readwrite_text(benchmark): + initialize_bucket() + + 
key = _GCS_URL + '/sanity.txt' + text = 'с гранатою в кармане, с чекою в руке' + actual = benchmark(write_read, key, text, 'w', 'r', 'utf-8') + assert actual == text + + +def test_gcs_readwrite_text_gzip(benchmark): + initialize_bucket() + + key = _GCS_URL + '/sanity.txt.gz' + text = 'не чайки здесь запели на знакомом языке' + actual = benchmark(write_read, key, text, 'w', 'r', 'utf-8') + assert actual == text + + +def test_gcs_readwrite_binary(benchmark): + initialize_bucket() + + key = _GCS_URL + '/sanity.txt' + binary = b'this is a test' + actual = benchmark(write_read, key, binary, 'wb', 'rb') + assert actual == binary + + +def test_gcs_readwrite_binary_gzip(benchmark): + initialize_bucket() + + key = _GCS_URL + '/sanity.txt.gz' + binary = b'this is a test' + actual = benchmark(write_read, key, binary, 'wb', 'rb') + assert actual == binary + + +def test_gcs_performance(benchmark): + initialize_bucket() + + one_megabyte = io.BytesIO() + for _ in range(1024*128): + one_megabyte.write(b'01234567') + one_megabyte = one_megabyte.getvalue() + + key = _GCS_URL + '/performance.txt' + actual = benchmark(write_read, key, one_megabyte, 'wb', 'rb') + assert actual == one_megabyte + + +def test_gcs_performance_gz(benchmark): + initialize_bucket() + + one_megabyte = io.BytesIO() + for _ in range(1024*128): + one_megabyte.write(b'01234567') + one_megabyte = one_megabyte.getvalue() + + key = _GCS_URL + '/performance.txt.gz' + actual = benchmark(write_read, key, one_megabyte, 'wb', 'rb') + assert actual == one_megabyte + +def test_gcs_performance_small_reads(benchmark): + initialize_bucket() + + ONE_MIB = 1024**2 + one_megabyte_of_msgs = io.BytesIO() + msg = b'\x0f' + b'0123456789abcde' # a length-prefixed "message" + for _ in range(0, ONE_MIB, len(msg)): + one_megabyte_of_msgs.write(msg) + one_megabyte_of_msgs = one_megabyte_of_msgs.getvalue() + + key = _GCS_URL + '/many_reads_performance.bin' + + with smart_open.smart_open(key, 'wb') as fout: + 
fout.write(one_megabyte_of_msgs) + + actual = benchmark(read_length_prefixed_messages, key, 'rb', buffer_size=ONE_MIB) + assert actual == one_megabyte_of_msgs + +# def test_gcs_encrypted_file(benchmark): +# initialize_bucket() +# +# key = _GCS_URL + '/sanity.txt' +# text = 'с гранатою в кармане, с чекою в руке' +# actual = benchmark(write_read, key, text, 'w', 'r', 'utf-8', s3_upload={ +# 'ServerSideEncryption': 'AES256' +# }) +# assert actual == text \ No newline at end of file From 4fc43fa975f0e5fb219f4e8e1d4ceb3e7b5ddac1 Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Mon, 18 Nov 2019 19:41:59 -0500 Subject: [PATCH 07/84] Add .env to .gitignore --- .gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index deb05a87..04e894c6 100644 --- a/.gitignore +++ b/.gitignore @@ -58,4 +58,7 @@ target/ *.swo # PyCharm -.idea/ \ No newline at end of file +.idea/ + +# env files +.env From 67feb7bcf460579bf42ae864c6f544d13931fce9 Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Sat, 4 Jan 2020 21:04:55 -0500 Subject: [PATCH 08/84] Integration tests passing --- integration-tests/test_gcs.py | 1 + smart_open/gcs.py | 56 +++++++++++++++++++++++++---------- 2 files changed, 41 insertions(+), 16 deletions(-) diff --git a/integration-tests/test_gcs.py b/integration-tests/test_gcs.py index 20bdb3a8..c513a2f9 100644 --- a/integration-tests/test_gcs.py +++ b/integration-tests/test_gcs.py @@ -100,6 +100,7 @@ def test_gcs_performance_gz(benchmark): actual = benchmark(write_read, key, one_megabyte, 'wb', 'rb') assert actual == one_megabyte + def test_gcs_performance_small_reads(benchmark): initialize_bucket() diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 89aca20d..baaf14f8 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -25,19 +25,26 @@ _UNKNOWN_FILE_SIZE = '*' +_RESUME_INCOMPLETE = 308 + BINARY_NEWLINE = b'\n' SUPPORTED_SCHEMES = ("gcs", "gs") -DEFAULT_MIN_PART_SIZE = MIN_MIN_PART_SIZE = 256 * 1024 
+REQUIRED_CHUNK_MULTIPLE = 256 * 1024 +"""Google requires you to upload in multiples of 256 KB, except for the last part.""" + +MIN_MIN_PART_SIZE = 256 * 1024 """The absolute minimum permitted by Google.""" -DEFAULT_BUFFER_SIZE = 128 * 1024 +DEFAULT_BUFFER_SIZE = 256 * 1024 START = 0 CURRENT = 1 END = 2 WHENCE_CHOICES = [START, CURRENT, END] +SUCCESSFUL_STATUS_CODES = (200, 201) + def clamp(value, minval, maxval): return max(min(value, maxval), minval) @@ -52,12 +59,15 @@ def make_range_string(start, stop=None, end=_UNKNOWN_FILE_SIZE): return 'bytes %d-%d/%s' % (start, stop, end) +class UploadFailedError(Exception): + """Raised when a multi-part upload to GCS returns a failed response status code.""" + + def open( bucket_id, blob_id, mode, buffer_size=DEFAULT_BUFFER_SIZE, - min_part_size=DEFAULT_MIN_PART_SIZE, client=None, # type: storage.Client ): """Open an GCS blob for reading or writing. @@ -72,8 +82,6 @@ def open( The mode for opening the object. Must be either "rb" or "wb". buffer_size: int, optional The buffer size to use when performing I/O. - min_part_size: int, optional - The minimum part size for multipart uploads. For writing only. client: object, optional The GCS client to use when working with google-cloud-storage. 
@@ -90,7 +98,7 @@ def open( return BufferedOutputBase( bucket_id, blob_id, - min_part_size=min_part_size, + buffer_size=buffer_size, client=client, ) else: @@ -118,9 +126,9 @@ class SeekableRawReader(object): """Read an GCS object.""" def __init__( - self, - gcs_blob, # type: storage.Blob - size, + self, + gcs_blob, # type: storage.Blob + size, ): self._blob = gcs_blob self._size = size @@ -378,7 +386,7 @@ def __init__( self, bucket, blob, - min_part_size=DEFAULT_MIN_PART_SIZE, + buffer_size=DEFAULT_BUFFER_SIZE, client=None, # type: storage.Client ): if client is None: @@ -387,8 +395,9 @@ def __init__( self._credentials = self._client._credentials self._bucket = self._client.bucket(bucket) # type: storage.Bucket self._blob = self._bucket.blob(blob) # type: storage.Blob + assert buffer_size % MIN_MIN_PART_SIZE == 0, 'buffer size must be a multiple of 256KB' + self._buffer_size = buffer_size - self._min_part_size = min_part_size self._total_size = 0 self._total_parts = 0 self._buf = io.BytesIO() @@ -443,7 +452,7 @@ def write(self, b): self._buf.write(b) self._total_size += len(b) - if self._buf.tell() >= self._min_part_size: + if self._buf.tell() >= self._buffer_size: self._upload_next_part() return len(b) @@ -462,19 +471,34 @@ def _upload_next_part(self): content_length = self._buf.tell() start = self._total_size - content_length stop = self._total_size - 1 - if content_length < MIN_MIN_PART_SIZE: + if content_length != self._buffer_size: end = content_length else: end = _UNKNOWN_FILE_SIZE + if content_length != REQUIRED_CHUNK_MULTIPLE: + stop = content_length // REQUIRED_CHUNK_MULTIPLE * REQUIRED_CHUNK_MULTIPLE - 1 + self._buf.seek(0) headers = { 'Content-Length': str(content_length), 'Content-Range': make_range_string(start, stop, end) } - # TODO: Add error handling / retrying here - response = self._session.put(self._resumeable_upload_url, data=self._buf, headers=headers) - assert response.status_code in (200, 201) + data = self._buf + response = 
self._session.put(self._resumeable_upload_url, data=data, headers=headers) + # TODO: Figure out a way to avoid sending another request when the last upload part + # is a multiple of the min part size + if response.status_code == _RESUME_INCOMPLETE: + end = content_length + headers = { + 'Content-Length': str(content_length), + 'Content-Range': make_range_string(start, stop, end) + } + response = self._session.put(self._resumeable_upload_url, data=data, headers=headers) + if response.status_code not in SUCCESSFUL_STATUS_CODES: + logger.error("upload failed with status %s", response.status_code) + logger.error("response message: %s", str(response.json())) + raise UploadFailedError logger.debug("upload of part #%i finished" % part_num) self._total_parts += 1 From 62252f01ad04094d95119a6dfc3ff2f1180233a6 Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Sat, 4 Jan 2020 21:07:45 -0500 Subject: [PATCH 09/84] Remove kms integration test --- integration-tests/test_gcs.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/integration-tests/test_gcs.py b/integration-tests/test_gcs.py index c513a2f9..e2f8760f 100644 --- a/integration-tests/test_gcs.py +++ b/integration-tests/test_gcs.py @@ -118,13 +118,3 @@ def test_gcs_performance_small_reads(benchmark): actual = benchmark(read_length_prefixed_messages, key, 'rb', buffer_size=ONE_MIB) assert actual == one_megabyte_of_msgs - -# def test_gcs_encrypted_file(benchmark): -# initialize_bucket() -# -# key = _GCS_URL + '/sanity.txt' -# text = 'с гранатою в кармане, с чекою в руке' -# actual = benchmark(write_read, key, text, 'w', 'r', 'utf-8', s3_upload={ -# 'ServerSideEncryption': 'AES256' -# }) -# assert actual == text \ No newline at end of file From 40cbb7d52e360b47baf83ade6f75f22ed7e91a5a Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Sat, 4 Jan 2020 21:33:59 -0500 Subject: [PATCH 10/84] Fix failing test --- smart_open/gcs.py | 5 +---- smart_open/tests/test_gcs.py | 14 +++++++------- 2 files changed, 8 
insertions(+), 11 deletions(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index baaf14f8..fabd5ef0 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -490,10 +490,7 @@ def _upload_next_part(self): # is a multiple of the min part size if response.status_code == _RESUME_INCOMPLETE: end = content_length - headers = { - 'Content-Length': str(content_length), - 'Content-Range': make_range_string(start, stop, end) - } + headers['Content-Range'] = make_range_string(start, stop, end) response = self._session.put(self._resumeable_upload_url, data=data, headers=headers) if response.status_code not in SUCCESSFUL_STATUS_CODES: logger.error("upload failed with status %s", response.status_code) diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index 5d5cd02e..cb4c2e81 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -505,23 +505,23 @@ def test_write_03(self): """Does gcs multipart chunking work correctly?""" # write smart_open_write = smart_open.gcs.BufferedOutputBase( - BUCKET_NAME, WRITE_BLOB_NAME, min_part_size=10 + BUCKET_NAME, WRITE_BLOB_NAME, buffer_size=256 * 1024 ) with smart_open_write as fout: - fout.write(b"test") - self.assertEqual(fout._buf.tell(), 4) + fout.write(b"t" * 262141) + self.assertEqual(fout._buf.tell(), 262141) - fout.write(b"test\n") - self.assertEqual(fout._buf.tell(), 9) + fout.write(b"t\n") + self.assertEqual(fout._buf.tell(), 262143) self.assertEqual(fout._total_parts, 0) - fout.write(b"test") + fout.write(b"t") self.assertEqual(fout._buf.tell(), 0) self.assertEqual(fout._total_parts, 1) # read back the same key and check its content output = list(smart_open.open("gcs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME))) - self.assertEqual(output, ["testtest\n", "test"]) + self.assertEqual(output, ["t" * 262142 + '\n', "t"]) @maybe_mock_gcs def test_write_04(self): From 5769e9be976ff76c46cb425a352542424c5f750f Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Sat, 4 Jan 2020 21:41:09 
-0500 Subject: [PATCH 11/84] Change writer from buffer size to min part size --- smart_open/gcs.py | 11 +++++++---- smart_open/tests/test_gcs.py | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index fabd5ef0..61ec746f 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -68,6 +68,7 @@ def open( blob_id, mode, buffer_size=DEFAULT_BUFFER_SIZE, + min_part_size=MIN_MIN_PART_SIZE, client=None, # type: storage.Client ): """Open an GCS blob for reading or writing. @@ -82,6 +83,8 @@ def open( The mode for opening the object. Must be either "rb" or "wb". buffer_size: int, optional The buffer size to use when performing I/O. + min_part_size: int, optional + The minimum part size for multipart uploads. For writing only. client: object, optional The GCS client to use when working with google-cloud-storage. @@ -98,7 +101,7 @@ def open( return BufferedOutputBase( bucket_id, blob_id, - buffer_size=buffer_size, + min_part_size=min_part_size, client=client, ) else: @@ -386,7 +389,7 @@ def __init__( self, bucket, blob, - buffer_size=DEFAULT_BUFFER_SIZE, + min_part_size=MIN_MIN_PART_SIZE, client=None, # type: storage.Client ): if client is None: @@ -395,8 +398,8 @@ def __init__( self._credentials = self._client._credentials self._bucket = self._client.bucket(bucket) # type: storage.Bucket self._blob = self._bucket.blob(blob) # type: storage.Blob - assert buffer_size % MIN_MIN_PART_SIZE == 0, 'buffer size must be a multiple of 256KB' - self._buffer_size = buffer_size + assert min_part_mize % MIN_MIN_PART_SIZE == 0, 'buffer size must be a multiple of 256KB' + self._min_part_size = min_part_size self._total_size = 0 self._total_parts = 0 diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index cb4c2e81..4f0d5dcb 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -505,7 +505,7 @@ def test_write_03(self): """Does gcs multipart chunking work correctly?""" # write 
smart_open_write = smart_open.gcs.BufferedOutputBase( - BUCKET_NAME, WRITE_BLOB_NAME, buffer_size=256 * 1024 + BUCKET_NAME, WRITE_BLOB_NAME, min_part_size=256 * 1024 ) with smart_open_write as fout: fout.write(b"t" * 262141) From c31f79d0bafa47fa469130462594f5d71adb8477 Mon Sep 17 00:00:00 2001 From: Peter Date: Sat, 4 Jan 2020 23:39:12 -0500 Subject: [PATCH 12/84] Fix typo --- smart_open/gcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 61ec746f..6ac172de 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -398,7 +398,7 @@ def __init__( self._credentials = self._client._credentials self._bucket = self._client.bucket(bucket) # type: storage.Bucket self._blob = self._bucket.blob(blob) # type: storage.Blob - assert min_part_mize % MIN_MIN_PART_SIZE == 0, 'buffer size must be a multiple of 256KB' + assert min_part_size % MIN_MIN_PART_SIZE == 0, 'min part size must be a multiple of 256KB' self._min_part_size = min_part_size self._total_size = 0 From e2be2832fee1f2426eb3539a5f109fa40491d450 Mon Sep 17 00:00:00 2001 From: Peter Date: Sat, 4 Jan 2020 23:40:30 -0500 Subject: [PATCH 13/84] Change buffer size to min part size --- smart_open/gcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 6ac172de..62886fab 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -455,7 +455,7 @@ def write(self, b): self._buf.write(b) self._total_size += len(b) - if self._buf.tell() >= self._buffer_size: + if self._buf.tell() >= self._min_part_size: self._upload_next_part() return len(b) From 0eca1d386259e14ce1c7173506da7d2b1d7156be Mon Sep 17 00:00:00 2001 From: Peter Date: Sat, 4 Jan 2020 23:45:24 -0500 Subject: [PATCH 14/84] Fix another missed min_part_size --- smart_open/gcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 62886fab..434fa253 100644 --- a/smart_open/gcs.py +++ 
b/smart_open/gcs.py @@ -474,7 +474,7 @@ def _upload_next_part(self): content_length = self._buf.tell() start = self._total_size - content_length stop = self._total_size - 1 - if content_length != self._buffer_size: + if content_length != self._min_part_size: end = content_length else: end = _UNKNOWN_FILE_SIZE From 9357d484da914c8fa922b42054e570a0d98b7f5a Mon Sep 17 00:00:00 2001 From: Peter Date: Sun, 5 Jan 2020 08:21:36 -0500 Subject: [PATCH 15/84] Fix broken example --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index dc1c1549..507504a9 100644 --- a/README.rst +++ b/README.rst @@ -180,7 +180,7 @@ More examples print(line) # stream content *into* GCS (write mode): - with open('gs://my_bucket/my_file.txt') as fout: + with open('gs://my_bucket/my_file.txt', 'wb') as fout: fout.write(b'hello world') Supported Compression Formats From eb2817b44f20528f3b530b1fbaf6c06ea8514f27 Mon Sep 17 00:00:00 2001 From: Peter Date: Sun, 5 Jan 2020 08:25:16 -0500 Subject: [PATCH 16/84] Change smart_open to open --- integration-tests/test_gcs.py | 8 ++++---- smart_open/tests/test_gcs.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/integration-tests/test_gcs.py b/integration-tests/test_gcs.py index e2f8760f..7df8a937 100644 --- a/integration-tests/test_gcs.py +++ b/integration-tests/test_gcs.py @@ -20,15 +20,15 @@ def initialize_bucket(): def write_read(key, content, write_mode, read_mode, encoding=None, **kwargs): - with smart_open.smart_open(key, write_mode, encoding=encoding, **kwargs) as fout: + with smart_open.open(key, write_mode, encoding=encoding, **kwargs) as fout: fout.write(content) - with smart_open.smart_open(key, read_mode, encoding=encoding, **kwargs) as fin: + with smart_open.open(key, read_mode, encoding=encoding, **kwargs) as fin: actual = fin.read() return actual def read_length_prefixed_messages(key, read_mode, encoding=None, **kwargs): - with smart_open.smart_open(key, read_mode, 
encoding=encoding, **kwargs) as fin: + with smart_open.open(key, read_mode, encoding=encoding, **kwargs) as fin: actual = b'' length_byte = fin.read(1); while len(length_byte): @@ -113,7 +113,7 @@ def test_gcs_performance_small_reads(benchmark): key = _GCS_URL + '/many_reads_performance.bin' - with smart_open.smart_open(key, 'wb') as fout: + with smart_open.open(key, 'wb') as fout: fout.write(one_megabyte_of_msgs) actual = benchmark(read_length_prefixed_messages, key, 'rb', buffer_size=ONE_MIB) diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index 4f0d5dcb..8704be3a 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -560,7 +560,7 @@ def test_buffered_writer_wrapper_works(self): with io.BufferedWriter(fout) as sub_out: sub_out.write(expected.encode('utf-8')) - with smart_open.smart_open("gcs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME)) as fin: + with smart_open.open("gcs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME)) as fin: with io.TextIOWrapper(fin, encoding='utf-8') as text: actual = text.read() From 00f7c86751509ba6f928b2396c159e50f933af40 Mon Sep 17 00:00:00 2001 From: Peter Date: Sun, 5 Jan 2020 08:26:28 -0500 Subject: [PATCH 17/84] Make WHENCE_CHOICES a tuple --- smart_open/gcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 434fa253..d95e3477 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -41,7 +41,7 @@ START = 0 CURRENT = 1 END = 2 -WHENCE_CHOICES = [START, CURRENT, END] +WHENCE_CHOICES = (START, CURRENT, END) SUCCESSFUL_STATUS_CODES = (200, 201) From 2544aca3205d9cb8da9193bd9797399a23f868f3 Mon Sep 17 00:00:00 2001 From: Peter Date: Sun, 5 Jan 2020 08:27:23 -0500 Subject: [PATCH 18/84] Specify client type --- smart_open/gcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index d95e3477..dd5b08ee 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -85,7 +85,7 @@ 
def open( The buffer size to use when performing I/O. min_part_size: int, optional The minimum part size for multipart uploads. For writing only. - client: object, optional + client: google.cloud.storage.Client, optional The GCS client to use when working with google-cloud-storage. """ From d3e0d8c69500dd97a7127f71098f1b8779838082 Mon Sep 17 00:00:00 2001 From: Peter Date: Sun, 5 Jan 2020 08:30:13 -0500 Subject: [PATCH 19/84] Make init one line instead of multiple --- smart_open/gcs.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index dd5b08ee..a886b8c0 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -110,9 +110,8 @@ def open( class RawReader(object): """Read an GCS blob.""" - def __init__(self, - gcs_blob, # type: storage.Blob - ): + def __init__(self, gcs_blob): + # type: (storage.Blob) -> None self.position = 0 self._blob = gcs_blob self._body = gcs_blob.download_as_string() @@ -128,11 +127,8 @@ def read(self, size=-1): class SeekableRawReader(object): """Read an GCS object.""" - def __init__( - self, - gcs_blob, # type: storage.Blob - size, - ): + def __init__(self, gcs_blob, size): + # type: (storage.Blob, int) -> None self._blob = gcs_blob self._size = size self.seek(0) From b2f38cf7de2a12cf0f46792c5006c45a75a15f07 Mon Sep 17 00:00:00 2001 From: Peter Date: Sun, 5 Jan 2020 08:31:28 -0500 Subject: [PATCH 20/84] Shorten _parse_uri_gcs --- smart_open/smart_open_lib.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index 78c48d66..adff7e8f 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -848,12 +848,9 @@ def _unquote(text): def _parse_uri_gcs(parsed_uri): assert parsed_uri.scheme in smart_open_gcs.SUPPORTED_SCHEMES - bucket_id, blob_id = parsed_uri.netloc, parsed_uri.path[1:] - return Uri( - scheme=parsed_uri.scheme, bucket_id=bucket_id, blob_id=blob_id, - ) + return 
Uri(scheme=parsed_uri.scheme, bucket_id=bucket_id, blob_id=blob_id) def _need_to_buffer(file_obj, mode, ext): From 0afb007e812ad4d5a3afc42216ffb0ba8e07c9fe Mon Sep 17 00:00:00 2001 From: Peter Date: Sun, 5 Jan 2020 09:14:19 -0500 Subject: [PATCH 21/84] Use open instead of smart_open --- integration-tests/test_gcs.py | 6 +++--- smart_open/gcs.py | 39 ++++++++++++++++++----------------- smart_open/tests/test_gcs.py | 2 +- 3 files changed, 24 insertions(+), 23 deletions(-) diff --git a/integration-tests/test_gcs.py b/integration-tests/test_gcs.py index 7df8a937..c2d6bdd6 100644 --- a/integration-tests/test_gcs.py +++ b/integration-tests/test_gcs.py @@ -2,7 +2,7 @@ import io import os -from google.cloud import storage +import google.cloud.storage import smart_open @@ -12,7 +12,7 @@ def initialize_bucket(): - storage_client = storage.Client() + storage_client = google.cloud.storage.Client() bucket = storage_client.get_bucket(_GCS_BUCKET) blobs = bucket.list_blobs() for blob in blobs: @@ -116,5 +116,5 @@ def test_gcs_performance_small_reads(benchmark): with smart_open.open(key, 'wb') as fout: fout.write(one_megabyte_of_msgs) - actual = benchmark(read_length_prefixed_messages, key, 'rb', buffer_size=ONE_MIB) + actual = benchmark(read_length_prefixed_messages, key, 'rb', buffering=ONE_MIB) assert actual == one_megabyte_of_msgs diff --git a/smart_open/gcs.py b/smart_open/gcs.py index a886b8c0..9981dd6f 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -5,7 +5,8 @@ import logging import sys -from google.cloud import storage, exceptions +import google.cloud.exceptions +import google.cloud.storage import google.auth.transport.requests as google_requests import six @@ -67,9 +68,9 @@ def open( bucket_id, blob_id, mode, - buffer_size=DEFAULT_BUFFER_SIZE, + buffering=DEFAULT_BUFFER_SIZE, min_part_size=MIN_MIN_PART_SIZE, - client=None, # type: storage.Client + client=None, # type: google.cloud.storage.Client ): """Open an GCS blob for reading or writing. 
@@ -81,11 +82,11 @@ def open( The name of the blob within the bucket. mode: str The mode for opening the object. Must be either "rb" or "wb". - buffer_size: int, optional + buffering: int, optional The buffer size to use when performing I/O. min_part_size: int, optional The minimum part size for multipart uploads. For writing only. - client: google.cloud.storage.Client, optional + client: google.cloud.google.cloud.storage.Client, optional The GCS client to use when working with google-cloud-storage. """ @@ -93,7 +94,7 @@ def open( return SeekableBufferedInputBase( bucket_id, blob_id, - buffer_size=buffer_size, + buffering=buffering, line_terminator=BINARY_NEWLINE, client=client, ) @@ -166,12 +167,12 @@ def __init__( self, bucket, key, - buffer_size=DEFAULT_BUFFER_SIZE, + buffering=DEFAULT_BUFFER_SIZE, line_terminator=BINARY_NEWLINE, - client=None, # type: storage.Client + client=None, # type: google.cloud.storage.Client ): if not client: - client = storage.Client() + client = google.cloud.storage.Client() bucket = client.get_bucket(bucket) # type: storage.Bucket @@ -180,8 +181,8 @@ def __init__( self._raw_reader = RawReader(self._blob) self._current_pos = 0 - self._buffer_size = buffer_size - self._buffer = smart_open.bytebuffer.ByteBuffer(buffer_size) + self._buffer_size = buffering + self._buffer = smart_open.bytebuffer.ByteBuffer(buffering) self._eof = False self._line_terminator = line_terminator @@ -310,23 +311,23 @@ def __init__( self, bucket, key, - buffer_size=DEFAULT_BUFFER_SIZE, + buffering=DEFAULT_BUFFER_SIZE, line_terminator=BINARY_NEWLINE, - client=None, # type: storage.Client + client=None, # type: google.cloud.storage.Client ): if not client: - client = storage.Client() + client = google.cloud.storage.Client() bucket = client.get_bucket(bucket) self._blob = bucket.get_blob(key) if self._blob is None: - raise exceptions.NotFound('blob {} not found in {}'.format(key, bucket)) + raise google.cloud.exceptions.NotFound('blob {} not found in 
{}'.format(key, bucket)) self._size = self._blob.size if self._blob.size is not None else 0 self._raw_reader = SeekableRawReader(self._blob, self._size) self._current_pos = 0 - self._buffer_size = buffer_size - self._buffer = smart_open.bytebuffer.ByteBuffer(buffer_size) + self._buffer_size = buffering + self._buffer = smart_open.bytebuffer.ByteBuffer(buffering) self._eof = False self._line_terminator = line_terminator @@ -386,10 +387,10 @@ def __init__( bucket, blob, min_part_size=MIN_MIN_PART_SIZE, - client=None, # type: storage.Client + client=None, # type: google.cloud.storage.Client ): if client is None: - client = storage.Client() + client = google.cloud.storage.Client() self._client = client self._credentials = self._client._credentials self._bucket = self._client.bucket(bucket) # type: storage.Bucket diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index 8704be3a..84571162 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -560,7 +560,7 @@ def test_buffered_writer_wrapper_works(self): with io.BufferedWriter(fout) as sub_out: sub_out.write(expected.encode('utf-8')) - with smart_open.open("gcs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME)) as fin: + with smart_open.open("gcs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME), 'rb') as fin: with io.TextIOWrapper(fin, encoding='utf-8') as text: actual = text.read() From 5b95e873f9b90d732d8b53b534111e080f0b0a87 Mon Sep 17 00:00:00 2001 From: Peter Date: Sun, 5 Jan 2020 10:28:51 -0500 Subject: [PATCH 22/84] Fix memory issue with RawReader --- smart_open/gcs.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 9981dd6f..a96a221d 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -115,14 +115,13 @@ def __init__(self, gcs_blob): # type: (storage.Blob) -> None self.position = 0 self._blob = gcs_blob - self._body = gcs_blob.download_as_string() def read(self, size=-1): if size == -1: - return 
self._body + return self._blob.download_as_string() start, end = self.position, self.position + size self.position = end - return self._body[start:end] + return self._blob.download_as_string(start=start, end=end) class SeekableRawReader(object): From 81f50aed453a782c9a2c7820b9591064833de835 Mon Sep 17 00:00:00 2001 From: Peter Date: Sun, 5 Jan 2020 12:41:20 -0500 Subject: [PATCH 23/84] Fix type hints --- smart_open/gcs.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index a96a221d..93212692 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -112,7 +112,7 @@ def open( class RawReader(object): """Read an GCS blob.""" def __init__(self, gcs_blob): - # type: (storage.Blob) -> None + # type: (google.cloud.storage.Blob) -> None self.position = 0 self._blob = gcs_blob @@ -128,10 +128,11 @@ class SeekableRawReader(object): """Read an GCS object.""" def __init__(self, gcs_blob, size): - # type: (storage.Blob, int) -> None + # type: (google.cloud.storage.Blob, int) -> None self._blob = gcs_blob self._size = size - self.seek(0) + self._position = 0 + self._body = None def seek(self, position): """Seek to the specified position (byte offset) in the GCS key. 
@@ -173,9 +174,9 @@ def __init__( if not client: client = google.cloud.storage.Client() - bucket = client.get_bucket(bucket) # type: storage.Bucket + bucket = client.get_bucket(bucket) # type: google.cloud.storage.Bucket - self._blob = bucket.get_blob(key) # type: storage.Blob + self._blob = bucket.get_blob(key) # type: google.cloud.storage.Blob self._size = self._blob.size if self._blob.size else 0 self._raw_reader = RawReader(self._blob) @@ -316,7 +317,7 @@ def __init__( ): if not client: client = google.cloud.storage.Client() - bucket = client.get_bucket(bucket) + bucket = client.get_bucket(bucket) # type: google.cloud.storage.Bucket self._blob = bucket.get_blob(key) if self._blob is None: @@ -392,8 +393,8 @@ def __init__( client = google.cloud.storage.Client() self._client = client self._credentials = self._client._credentials - self._bucket = self._client.bucket(bucket) # type: storage.Bucket - self._blob = self._bucket.blob(blob) # type: storage.Blob + self._bucket = self._client.bucket(bucket) # type: google.cloud.storage.Bucket + self._blob = self._bucket.blob(blob) # type: google.cloud.storage.Blob assert min_part_size % MIN_MIN_PART_SIZE == 0, 'min part size must be a multiple of 256KB' self._min_part_size = min_part_size From 2f8f22c315a58fd613dbcaf62310be79c262da2d Mon Sep 17 00:00:00 2001 From: Peter Date: Sun, 5 Jan 2020 12:42:08 -0500 Subject: [PATCH 24/84] Fix exists in test --- smart_open/tests/test_gcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index 84571162..8d42a9cb 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -34,7 +34,7 @@ def __init__(self, client, name=None): self.client = client # type: FakeClient self.name = name self.blobs = [] - self.exists = True + self._exists = True def blob(self, blob_id): blob = next((x for x in self.blobs if x.name == blob_id), None) From 7e074847c4d081a904a26e9c73931a3ca9890d8f Mon Sep 17 
00:00:00 2001 From: Peter Date: Sun, 5 Jan 2020 12:58:19 -0500 Subject: [PATCH 25/84] Fix typo in smart_open_lib about schemes --- smart_open/smart_open_lib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index adff7e8f..b013410e 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -705,7 +705,7 @@ def _parse_uri(uri_as_string): * webhdfs .s3, s3a and s3n are treated the same way. s3u is s3 but without SSL. - .gcs and gcs are treated the same way. + .gs and gcs are treated the same way. Valid URI examples:: From b779ccd15ecf2ce6974b4f484e3356ebb8eeb645 Mon Sep 17 00:00:00 2001 From: Peter Date: Sun, 5 Jan 2020 15:54:27 -0500 Subject: [PATCH 26/84] Remove RawReaders and BufferedInputBase --- smart_open/gcs.py | 172 +++++++++++++++++----------------------------- 1 file changed, 62 insertions(+), 110 deletions(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 93212692..565505bf 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -109,21 +109,6 @@ def open( raise NotImplementedError('GCS support for mode %r not implemented' % mode) -class RawReader(object): - """Read an GCS blob.""" - def __init__(self, gcs_blob): - # type: (google.cloud.storage.Blob) -> None - self.position = 0 - self._blob = gcs_blob - - def read(self, size=-1): - if size == -1: - return self._blob.download_as_string() - start, end = self.position, self.position + size - self.position = end - return self._blob.download_as_string(start=start, end=end) - - class SeekableRawReader(object): """Read an GCS object.""" @@ -132,7 +117,6 @@ def __init__(self, gcs_blob, size): self._blob = gcs_blob self._size = size self._position = 0 - self._body = None def seek(self, position): """Seek to the specified position (byte offset) in the GCS key. 
@@ -141,28 +125,34 @@ def seek(self, position): """ self._position = position - if position == self._size == 0 or position == self._size: + def read(self, size=-1): + if self._position >= self._size: + return b'' + binary = self._download_blob_chunk(size) + self._position += len(binary) + return binary + + def _download_blob_chunk(self, size): + position = self._position + if position == self._size: # # When reading, we can't seek to the first byte of an empty file. # Similarly, we can't seek past the last byte. Do nothing here. # - self._body = io.BytesIO() + binary = b'' + elif size == -1: + binary = self._blob.download_as_string(start=position) else: - start, end = position, position + self._size - self._body = self._blob.download_as_string(start=start, end=end) - - def read(self, size=-1): - if self._position >= self._size: - return b'' - if size == -1: - binary = self._body - else: - binary = self._body[:size] - self._position += len(binary) + start, end = position, position + size + binary = self._blob.download_as_string(start=start, end=end) return binary -class BufferedInputBase(io.BufferedIOBase): +class SeekableBufferedInputBase(io.BufferedIOBase): + """Reads bytes from GCS. 
+ + Implements the io.BufferedIOBase interface of the standard library.""" + def __init__( self, bucket, @@ -173,13 +163,14 @@ def __init__( ): if not client: client = google.cloud.storage.Client() - bucket = client.get_bucket(bucket) # type: google.cloud.storage.Bucket - self._blob = bucket.get_blob(key) # type: google.cloud.storage.Blob - self._size = self._blob.size if self._blob.size else 0 + self._blob = bucket.get_blob(key) + if self._blob is None: + raise google.cloud.exceptions.NotFound('blob {} not found in {}'.format(key, bucket)) + self._size = self._blob.size if self._blob.size is not None else 0 - self._raw_reader = RawReader(self._blob) + self._raw_reader = SeekableRawReader(self._blob, self._size) self._current_pos = 0 self._buffer_size = buffering self._buffer = smart_open.bytebuffer.ByteBuffer(buffering) @@ -204,7 +195,10 @@ def readable(self): return True def seekable(self): - return False + """If False, seek(), tell() and truncate() will raise IOError. + + We offer only seek support, and no truncate support.""" + return True # # io.BufferedIOBase methods. @@ -213,6 +207,39 @@ def detach(self): """Unsupported.""" raise io.UnsupportedOperation + def seek(self, offset, whence=START): + """Seek to the specified position. + + :param int offset: The offset in bytes. + :param int whence: Where the offset is from. 
+ + Returns the position after seeking.""" + logger.debug('seeking to offset: %r whence: %r', offset, whence) + if whence not in WHENCE_CHOICES: + raise ValueError('invalid whence, expected one of %r' % WHENCE_CHOICES) + + if whence == START: + new_position = offset + elif whence == CURRENT: + new_position = self._current_pos + offset + else: + new_position = self._size + offset + new_position = clamp(new_position, 0, self._size) + self._current_pos = new_position + self._raw_reader.seek(new_position) + logger.debug('new_position: %r', self._current_pos) + + self._eof = self._current_pos == self._size + return self._current_pos + + def tell(self): + """Return the current position within the file.""" + return self._current_pos + + def truncate(self, size=None): + """Unsupported.""" + raise io.UnsupportedOperation + def read(self, size=-1): """Read up to size bytes from the object and return them.""" if size == 0: @@ -302,81 +329,6 @@ def _fill_buffer(self, size=-1): self._eof = True -class SeekableBufferedInputBase(BufferedInputBase): - """Reads bytes from GCS. - - Implements the io.BufferedIOBase interface of the standard library.""" - - def __init__( - self, - bucket, - key, - buffering=DEFAULT_BUFFER_SIZE, - line_terminator=BINARY_NEWLINE, - client=None, # type: google.cloud.storage.Client - ): - if not client: - client = google.cloud.storage.Client() - bucket = client.get_bucket(bucket) # type: google.cloud.storage.Bucket - - self._blob = bucket.get_blob(key) - if self._blob is None: - raise google.cloud.exceptions.NotFound('blob {} not found in {}'.format(key, bucket)) - self._size = self._blob.size if self._blob.size is not None else 0 - - self._raw_reader = SeekableRawReader(self._blob, self._size) - self._current_pos = 0 - self._buffer_size = buffering - self._buffer = smart_open.bytebuffer.ByteBuffer(buffering) - self._eof = False - self._line_terminator = line_terminator - - # - # This member is part of the io.BufferedIOBase interface. 
- # - self.raw = None - - def seekable(self): - """If False, seek(), tell() and truncate() will raise IOError. - - We offer only seek support, and no truncate support.""" - return True - - def seek(self, offset, whence=START): - """Seek to the specified position. - - :param int offset: The offset in bytes. - :param int whence: Where the offset is from. - - Returns the position after seeking.""" - logger.debug('seeking to offset: %r whence: %r', offset, whence) - if whence not in WHENCE_CHOICES: - raise ValueError('invalid whence, expected one of %r' % WHENCE_CHOICES) - - if whence == START: - new_position = offset - elif whence == CURRENT: - new_position = self._current_pos + offset - else: - new_position = self._size + offset - new_position = clamp(new_position, 0, self._size) - self._current_pos = new_position - self._raw_reader.seek(new_position) - logger.debug('new_position: %r', self._current_pos) - - self._buffer.empty() - self._eof = self._current_pos == self._size - return self._current_pos - - def tell(self): - """Return the current position within the file.""" - return self._current_pos - - def truncate(self, size=None): - """Unsupported.""" - raise io.UnsupportedOperation - - class BufferedOutputBase(io.BufferedIOBase): """Writes bytes to GCS. From 66a0b33cee062fc760ab363f476e12db2590f591 Mon Sep 17 00:00:00 2001 From: Peter Date: Sun, 5 Jan 2020 15:54:49 -0500 Subject: [PATCH 27/84] Remove unneeded arg passing in tests --- smart_open/tests/test_gcs.py | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index 8d42a9cb..259b8610 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -245,9 +245,9 @@ def inner(*args, **kwargs): return_value=FakeAuthorizedSession(storage_client._credentials), ) as mocked_session: assert callable(func), 'you didn\'t provide a function!' 
- try: # is it a method that needs a self arg and mock args? + try: # is it a method that needs a self arg? self_arg = inspect.signature(func).self - func(self_arg, mocked_client, mocked_session, *args, **kwargs) + func(self_arg, *args, **kwargs) except AttributeError: func(*args, **kwargs) return inner @@ -290,11 +290,12 @@ def setUp(self): ignore_resource_warnings() + @maybe_mock_gcs def tearDown(self): cleanup_bucket() @maybe_mock_gcs - def test_iter(self, *args): + def test_iter(self): """Are GCS files iterated over correctly?""" # a list of strings to test with expected = u"hello wořld\nhow are you?".encode('utf8') @@ -306,7 +307,7 @@ def test_iter(self, *args): self.assertEqual(output, expected.split(b'\n')) @maybe_mock_gcs - def test_iter_context_manager(self, *args): + def test_iter_context_manager(self): # same thing but using a context manager expected = u"hello wořld\nhow are you?".encode('utf8') put_to_bucket(contents=expected) @@ -315,7 +316,7 @@ def test_iter_context_manager(self, *args): self.assertEqual(output, expected.split(b'\n')) @maybe_mock_gcs - def test_read(self, *args): + def test_read(self): """Are GCS files read correctly?""" content = u"hello wořld\nhow are you?".encode('utf8') put_to_bucket(contents=content) @@ -327,7 +328,7 @@ def test_read(self, *args): self.assertEqual(content[14:], fin.read()) # read the rest @maybe_mock_gcs - def test_seek_beginning(self, *args): + def test_seek_beginning(self): """Does seeking to the beginning of GCS files work correctly?""" content = u"hello wořld\nhow are you?".encode('utf8') put_to_bucket(contents=content) @@ -343,7 +344,7 @@ def test_seek_beginning(self, *args): self.assertEqual(content, fin.read(-1)) # same thing @maybe_mock_gcs - def test_seek_start(self, *args): + def test_seek_start(self): """Does seeking from the start of GCS files work correctly?""" content = u"hello wořld\nhow are you?".encode('utf8') put_to_bucket(contents=content) @@ -355,7 +356,7 @@ def test_seek_start(self, *args): 
self.assertEqual(fin.read(6), u'wořld'.encode('utf-8')) @maybe_mock_gcs - def test_seek_current(self, *args): + def test_seek_current(self): """Does seeking from the middle of GCS files work correctly?""" content = u"hello wořld\nhow are you?".encode('utf8') put_to_bucket(contents=content) @@ -367,7 +368,7 @@ def test_seek_current(self, *args): self.assertEqual(fin.read(6), u'wořld'.encode('utf-8')) @maybe_mock_gcs - def test_seek_end(self, *args): + def test_seek_end(self): """Does seeking from the end of GCS files work correctly?""" content = u"hello wořld\nhow are you?".encode('utf8') put_to_bucket(contents=content) @@ -378,7 +379,7 @@ def test_seek_end(self, *args): self.assertEqual(fin.read(), b'you?') @maybe_mock_gcs - def test_detect_eof(self, *args): + def test_detect_eof(self): content = u"hello wořld\nhow are you?".encode('utf8') put_to_bucket(contents=content) @@ -390,7 +391,7 @@ def test_detect_eof(self, *args): self.assertEqual(eof, fin.tell()) @maybe_mock_gcs - def test_read_gzip(self, *args): + def test_read_gzip(self): expected = u'раcцветали яблони и груши, поплыли туманы над рекой...'.encode('utf-8') buf = io.BytesIO() buf.close = lambda: None # keep buffer open so that we can .getvalue() @@ -419,7 +420,7 @@ def test_read_gzip(self, *args): self.assertEqual(expected, actual) @maybe_mock_gcs - def test_readline(self, *args): + def test_readline(self): content = b'englishman\nin\nnew\nyork\n' put_to_bucket(contents=content) @@ -435,22 +436,22 @@ def test_readline(self, *args): self.assertEqual(expected, actual) @maybe_mock_gcs - def test_readline_tiny_buffer(self, *args): + def test_readline_tiny_buffer(self): content = b'englishman\nin\nnew\nyork\n' put_to_bucket(contents=content) - with smart_open.gcs.BufferedInputBase(BUCKET_NAME, BLOB_NAME, buffer_size=8) as fin: + with smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME, buffering=8) as fin: actual = list(fin) expected = [b'englishman\n', b'in\n', b'new\n', b'york\n'] 
self.assertEqual(expected, actual) @maybe_mock_gcs - def test_read0_does_not_return_data(self, *args): + def test_read0_does_not_return_data(self): content = b'englishman\nin\nnew\nyork\n' put_to_bucket(contents=content) - with smart_open.gcs.BufferedInputBase(BUCKET_NAME, BLOB_NAME) as fin: + with smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) as fin: data = fin.read(0) self.assertEqual(data, b'') @@ -464,6 +465,7 @@ class BufferedOutputBaseTest(unittest.TestCase): def setUp(self): ignore_resource_warnings() + @maybe_mock_gcs def tearDown(self): cleanup_bucket() @@ -619,6 +621,7 @@ class OpenTest(unittest.TestCase): def setUp(self): ignore_resource_warnings() + @maybe_mock_gcs def tearDown(self): cleanup_bucket() From ca90299216d720192075c947d407d912ec8c6267 Mon Sep 17 00:00:00 2001 From: Peter Date: Sun, 5 Jan 2020 16:21:47 -0500 Subject: [PATCH 28/84] Fix bug with seek --- smart_open/gcs.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 565505bf..47b2ec8e 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -26,8 +26,6 @@ _UNKNOWN_FILE_SIZE = '*' -_RESUME_INCOMPLETE = 308 - BINARY_NEWLINE = b'\n' SUPPORTED_SCHEMES = ("gcs", "gs") @@ -45,6 +43,7 @@ WHENCE_CHOICES = (START, CURRENT, END) SUCCESSFUL_STATUS_CODES = (200, 201) +_RESUME_INCOMPLETE = 308 def clamp(value, minval, maxval): @@ -133,7 +132,7 @@ def read(self, size=-1): return binary def _download_blob_chunk(self, size): - position = self._position + start = position = self._position if position == self._size: # # When reading, we can't seek to the first byte of an empty file. 
@@ -141,9 +140,9 @@ def _download_blob_chunk(self, size): # binary = b'' elif size == -1: - binary = self._blob.download_as_string(start=position) + binary = self._blob.download_as_string(start=start) else: - start, end = position, position + size + end = position + size binary = self._blob.download_as_string(start=start, end=end) return binary @@ -229,6 +228,7 @@ def seek(self, offset, whence=START): self._raw_reader.seek(new_position) logger.debug('new_position: %r', self._current_pos) + self._buffer.empty() self._eof = self._current_pos == self._size return self._current_pos @@ -264,7 +264,6 @@ def read(self, size=-1): # # Fill our buffer to the required size. # - # logger.debug('filling %r byte-long buffer up to %r bytes', len(self._buffer), size) self._fill_buffer(size) return self._read_from_buffer(size) @@ -355,7 +354,10 @@ def __init__( self._buf = io.BytesIO() self._session = google_requests.AuthorizedSession(self._credentials) + + # # https://cloud.google.com/storage/docs/json_api/v1/how-tos/resumable-upload#start-resumable + # self._resumeable_upload_url = self._blob.create_resumable_upload_session() # @@ -410,7 +412,9 @@ def write(self, b): return len(b) def terminate(self): + # # https://cloud.google.com/storage/docs/xml-api/resumable-upload#example_cancelling_an_upload + # self._session.delete(self._resumeable_upload_url) # From ca4c26b59293bf3020be6b4a37303dafbc8c19cb Mon Sep 17 00:00:00 2001 From: Peter Date: Sun, 5 Jan 2020 16:38:31 -0500 Subject: [PATCH 29/84] Internal and doc stringed constants --- smart_open/gcs.py | 48 ++++++++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 47b2ec8e..5b5a8392 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -14,9 +14,9 @@ logger = logging.getLogger(__name__) -READ_BINARY = 'rb' -WRITE_BINARY = 'wb' -MODES = (READ_BINARY, WRITE_BINARY) +_READ_BINARY = 'rb' +_WRITE_BINARY = 'wb' +_MODES = (_READ_BINARY, 
_WRITE_BINARY) """Allowed I/O modes for working with GCS.""" _BINARY_TYPES = (six.binary_type, bytearray) @@ -26,23 +26,29 @@ _UNKNOWN_FILE_SIZE = '*' -BINARY_NEWLINE = b'\n' +_BINARY_NEWLINE = b'\n' SUPPORTED_SCHEMES = ("gcs", "gs") +"""Supported schemes for GCS""" -REQUIRED_CHUNK_MULTIPLE = 256 * 1024 +_MIN_MIN_PART_SIZE = _REQUIRED_CHUNK_MULTIPLE = 256 * 1024 """Google requires you to upload in multiples of 256 KB, except for the last part.""" -MIN_MIN_PART_SIZE = 256 * 1024 -"""The absolute minimum permitted by Google.""" +_DEFAULT_MIN_PART_SIZE = 50 * 1024**2 +"""Default minimum part size for GCS multipart uploads""" + DEFAULT_BUFFER_SIZE = 256 * 1024 +"""Default buffer size for working with GCS""" START = 0 +"""Seek to the absolute start of a GCS file""" CURRENT = 1 +"""Seek relative to the current positive of a GCS file""" END = 2 -WHENCE_CHOICES = (START, CURRENT, END) +"""Seek relative to the end of a GCS file""" +_WHENCE_CHOICES = (START, CURRENT, END) -SUCCESSFUL_STATUS_CODES = (200, 201) +_SUCCESSFUL_STATUS_CODES = (200, 201) _RESUME_INCOMPLETE = 308 @@ -68,7 +74,7 @@ def open( blob_id, mode, buffering=DEFAULT_BUFFER_SIZE, - min_part_size=MIN_MIN_PART_SIZE, + min_part_size=_MIN_MIN_PART_SIZE, client=None, # type: google.cloud.storage.Client ): """Open an GCS blob for reading or writing. @@ -89,15 +95,15 @@ def open( The GCS client to use when working with google-cloud-storage. 
""" - if mode == READ_BINARY: + if mode == _READ_BINARY: return SeekableBufferedInputBase( bucket_id, blob_id, buffering=buffering, - line_terminator=BINARY_NEWLINE, + line_terminator=_BINARY_NEWLINE, client=client, ) - elif mode == WRITE_BINARY: + elif mode == _WRITE_BINARY: return BufferedOutputBase( bucket_id, blob_id, @@ -157,7 +163,7 @@ def __init__( bucket, key, buffering=DEFAULT_BUFFER_SIZE, - line_terminator=BINARY_NEWLINE, + line_terminator=_BINARY_NEWLINE, client=None, # type: google.cloud.storage.Client ): if not client: @@ -214,8 +220,8 @@ def seek(self, offset, whence=START): Returns the position after seeking.""" logger.debug('seeking to offset: %r whence: %r', offset, whence) - if whence not in WHENCE_CHOICES: - raise ValueError('invalid whence, expected one of %r' % WHENCE_CHOICES) + if whence not in _WHENCE_CHOICES: + raise ValueError('invalid whence, expected one of %r' % _WHENCE_CHOICES) if whence == START: new_position = offset @@ -337,7 +343,7 @@ def __init__( self, bucket, blob, - min_part_size=MIN_MIN_PART_SIZE, + min_part_size=_DEFAULT_MIN_PART_SIZE, client=None, # type: google.cloud.storage.Client ): if client is None: @@ -346,7 +352,7 @@ def __init__( self._credentials = self._client._credentials self._bucket = self._client.bucket(bucket) # type: google.cloud.storage.Bucket self._blob = self._bucket.blob(blob) # type: google.cloud.storage.Blob - assert min_part_size % MIN_MIN_PART_SIZE == 0, 'min part size must be a multiple of 256KB' + assert min_part_size % _MIN_MIN_PART_SIZE == 0, 'min part size must be a multiple of 256KB' self._min_part_size = min_part_size self._total_size = 0 @@ -431,8 +437,8 @@ def _upload_next_part(self): end = content_length else: end = _UNKNOWN_FILE_SIZE - if content_length != REQUIRED_CHUNK_MULTIPLE: - stop = content_length // REQUIRED_CHUNK_MULTIPLE * REQUIRED_CHUNK_MULTIPLE - 1 + if content_length != _REQUIRED_CHUNK_MULTIPLE: + stop = content_length // _REQUIRED_CHUNK_MULTIPLE * _REQUIRED_CHUNK_MULTIPLE - 1 
self._buf.seek(0) @@ -448,7 +454,7 @@ def _upload_next_part(self): end = content_length headers['Content-Range'] = make_range_string(start, stop, end) response = self._session.put(self._resumeable_upload_url, data=data, headers=headers) - if response.status_code not in SUCCESSFUL_STATUS_CODES: + if response.status_code not in _SUCCESSFUL_STATUS_CODES: logger.error("upload failed with status %s", response.status_code) logger.error("response message: %s", str(response.json())) raise UploadFailedError From c4d159529f273420bf62bb367ffd82c46a1f4f7a Mon Sep 17 00:00:00 2001 From: Peter Date: Sun, 5 Jan 2020 16:44:00 -0500 Subject: [PATCH 30/84] Internal functions and doc strings --- smart_open/gcs.py | 26 +++++++++++++++++++------- smart_open/tests/test_gcs.py | 4 ++-- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 5b5a8392..bef9cb1c 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -24,10 +24,10 @@ if sys.version_info >= (2, 7): _BINARY_TYPES = (six.binary_type, bytearray, memoryview) -_UNKNOWN_FILE_SIZE = '*' - _BINARY_NEWLINE = b'\n' +_UNKNOWN_FILE_SIZE = '*' + SUPPORTED_SCHEMES = ("gcs", "gs") """Supported schemes for GCS""" @@ -53,10 +53,22 @@ def clamp(value, minval, maxval): + """Restrict a value to a range within specified min and max values + + Parameters + ---------- + value: int + The value to clamp + minval: int + The minimum value within the range to clamp to + maxval: int + The maxiumum value within the range to clamp to + + Returns the clamped value""" return max(min(value, maxval), minval) -def make_range_string(start, stop=None, end=_UNKNOWN_FILE_SIZE): +def _make_range_string(start, stop=None, end=_UNKNOWN_FILE_SIZE): # # https://cloud.google.com/storage/docs/xml-api/resumable-upload#step_3upload_the_file_blocks # @@ -65,7 +77,7 @@ def make_range_string(start, stop=None, end=_UNKNOWN_FILE_SIZE): return 'bytes %d-%d/%s' % (start, stop, end) -class UploadFailedError(Exception): +class 
_UploadFailedError(Exception): """Raised when a multi-part upload to GCS returns a failed response status code.""" @@ -444,7 +456,7 @@ def _upload_next_part(self): headers = { 'Content-Length': str(content_length), - 'Content-Range': make_range_string(start, stop, end) + 'Content-Range': _make_range_string(start, stop, end) } data = self._buf response = self._session.put(self._resumeable_upload_url, data=data, headers=headers) @@ -452,12 +464,12 @@ def _upload_next_part(self): # is a multiple of the min part size if response.status_code == _RESUME_INCOMPLETE: end = content_length - headers['Content-Range'] = make_range_string(start, stop, end) + headers['Content-Range'] = _make_range_string(start, stop, end) response = self._session.put(self._resumeable_upload_url, data=data, headers=headers) if response.status_code not in _SUCCESSFUL_STATUS_CODES: logger.error("upload failed with status %s", response.status_code) logger.error("response message: %s", str(response.json())) - raise UploadFailedError + raise _UploadFailedError logger.debug("upload of part #%i finished" % part_num) self._total_parts += 1 diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index 259b8610..cbb5c6d3 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -648,11 +648,11 @@ def test(self): class MakeRangeStringTest(unittest.TestCase): def test_no_stop(self): start, stop, end = 1, None, 2 - self.assertEqual(smart_open.gcs.make_range_string(start, stop), 'bytes 1-/*') + self.assertEqual(smart_open.gcs._make_range_string(start, stop), 'bytes 1-/*') def test_stop(self): start, stop, end = 1, 2, smart_open.gcs._UNKNOWN_FILE_SIZE - self.assertEqual(smart_open.gcs.make_range_string(start, stop), 'bytes 1-2/*') + self.assertEqual(smart_open.gcs._make_range_string(start, stop), 'bytes 1-2/*') if __name__ == '__main__': From 79ee3330b9939b556d0258afbeef8c1f5fe65ecb Mon Sep 17 00:00:00 2001 From: Peter Date: Sun, 5 Jan 2020 19:26:19 -0500 Subject: [PATCH 
31/84] Fix issue with double issue on upload and add __str__ and __repr__ --- smart_open/gcs.py | 47 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index bef9cb1c..2fbd15e4 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -345,6 +345,22 @@ def _fill_buffer(self, size=-1): logger.debug('reached EOF while filling buffer') self._eof = True + def __str__(self): + return "(%s, %r, %r)" % \ + (self.__class__.__name__, self._bucket.name, self._blob.name) + + def __repr__(self): + return ( + "%s(" + "bucket=%r, " + "blob=%r, " + "buffer_size=%r)" + ) % ( + self._bucket.name, + self._blob.name, + self._buffer_size, + ) + class BufferedOutputBase(io.BufferedIOBase): """Writes bytes to GCS. @@ -445,11 +461,11 @@ def _upload_next_part(self): content_length = self._buf.tell() start = self._total_size - content_length stop = self._total_size - 1 - if content_length != self._min_part_size: + if stop - start + 1 == content_length: end = content_length else: end = _UNKNOWN_FILE_SIZE - if content_length != _REQUIRED_CHUNK_MULTIPLE: + if content_length % _REQUIRED_CHUNK_MULTIPLE != 0: stop = content_length // _REQUIRED_CHUNK_MULTIPLE * _REQUIRED_CHUNK_MULTIPLE - 1 self._buf.seek(0) @@ -460,15 +476,10 @@ def _upload_next_part(self): } data = self._buf response = self._session.put(self._resumeable_upload_url, data=data, headers=headers) - # TODO: Figure out a way to avoid sending another request when the last upload part - # is a multiple of the min part size - if response.status_code == _RESUME_INCOMPLETE: - end = content_length - headers['Content-Range'] = _make_range_string(start, stop, end) - response = self._session.put(self._resumeable_upload_url, data=data, headers=headers) + if response.status_code not in _SUCCESSFUL_STATUS_CODES: logger.error("upload failed with status %s", response.status_code) - logger.error("response message: %s", str(response.json())) + 
logger.error("response message: %s", response.text) raise _UploadFailedError logger.debug("upload of part #%i finished" % part_num) @@ -481,7 +492,7 @@ def _upload_empty_part(self): 'Content-Length': '0' } response = self._session.put(self._resumeable_upload_url, headers=headers) - assert response.status_code in (200, 201) + assert response.status_code in _SUCCESSFUL_STATUS_CODES self._total_parts += 1 @@ -493,3 +504,19 @@ def __exit__(self, exc_type, exc_val, exc_tb): self.terminate() else: self.close() + + def __str__(self): + return "(%s, %r, %r)" % \ + (self.__class__.__name__, self._bucket.name, self._blob.name) + + def __repr__(self): + return ( + "%s(" + "bucket=%r, " + "blob=%r, " + "min_part_size=%r)" + ) % ( + self._bucket.name, + self._blob.name, + self._min_part_size, + ) \ No newline at end of file From 9622bc66f81cc344bce4019ab5c047c1c1a60985 Mon Sep 17 00:00:00 2001 From: Peter Date: Sun, 5 Jan 2020 19:47:42 -0500 Subject: [PATCH 32/84] Minor cleanup --- smart_open/gcs.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 2fbd15e4..5a459021 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -49,7 +49,6 @@ _WHENCE_CHOICES = (START, CURRENT, END) _SUCCESSFUL_STATUS_CODES = (200, 201) -_RESUME_INCOMPLETE = 308 def clamp(value, minval, maxval): @@ -466,7 +465,7 @@ def _upload_next_part(self): else: end = _UNKNOWN_FILE_SIZE if content_length % _REQUIRED_CHUNK_MULTIPLE != 0: - stop = content_length // _REQUIRED_CHUNK_MULTIPLE * _REQUIRED_CHUNK_MULTIPLE - 1 + stop = content_length % _REQUIRED_CHUNK_MULTIPLE - 1 self._buf.seek(0) @@ -519,4 +518,4 @@ def __repr__(self): self._bucket.name, self._blob.name, self._min_part_size, - ) \ No newline at end of file + ) From 780c549d97df4816e6ab8ceec3a234dee197df23 Mon Sep 17 00:00:00 2001 From: Peter Date: Mon, 6 Jan 2020 08:23:05 -0500 Subject: [PATCH 33/84] Fix flake8 errors --- integration-tests/test_gcs.py | 6 +++--- 
smart_open/smart_open_lib.py | 2 +- smart_open/tests/test_gcs.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/integration-tests/test_gcs.py b/integration-tests/test_gcs.py index c2d6bdd6..67e8e98d 100644 --- a/integration-tests/test_gcs.py +++ b/integration-tests/test_gcs.py @@ -30,7 +30,7 @@ def write_read(key, content, write_mode, read_mode, encoding=None, **kwargs): def read_length_prefixed_messages(key, read_mode, encoding=None, **kwargs): with smart_open.open(key, read_mode, encoding=encoding, **kwargs) as fin: actual = b'' - length_byte = fin.read(1); + length_byte = fin.read(1) while len(length_byte): actual += length_byte msg = fin.read(ord(length_byte)) @@ -106,9 +106,9 @@ def test_gcs_performance_small_reads(benchmark): ONE_MIB = 1024**2 one_megabyte_of_msgs = io.BytesIO() - msg = b'\x0f' + b'0123456789abcde' # a length-prefixed "message" + msg = b'\x0f' + b'0123456789abcde' # a length-prefixed "message" for _ in range(0, ONE_MIB, len(msg)): - one_megabyte_of_msgs.write(msg) + one_megabyte_of_msgs.write(msg) one_megabyte_of_msgs = one_megabyte_of_msgs.getvalue() key = _GCS_URL + '/many_reads_performance.bin' diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index b013410e..8f69257b 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -582,7 +582,7 @@ def _open_binary_stream(uri, mode, transport_params): kw = _check_kwargs(smart_open_http.open, transport_params) return smart_open_http.open(uri, mode, **kw), filename elif parsed_uri.scheme in smart_open_gcs.SUPPORTED_SCHEMES: - kw =_check_kwargs(smart_open_gcs.open, transport_params) + kw = _check_kwargs(smart_open_gcs.open, transport_params) return smart_open_gcs.open(parsed_uri.bucket_id, parsed_uri.blob_id, mode, **kw), filename else: raise NotImplementedError("scheme %r is not supported", parsed_uri.scheme) diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index cbb5c6d3..d1f07b90 100644 --- 
a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -647,11 +647,11 @@ def test(self): class MakeRangeStringTest(unittest.TestCase): def test_no_stop(self): - start, stop, end = 1, None, 2 + start, stop = 1, None self.assertEqual(smart_open.gcs._make_range_string(start, stop), 'bytes 1-/*') def test_stop(self): - start, stop, end = 1, 2, smart_open.gcs._UNKNOWN_FILE_SIZE + start, stop = 1, 2 self.assertEqual(smart_open.gcs._make_range_string(start, stop), 'bytes 1-2/*') From 43827103adcd1e4d2fe16314a8caf3a97c0f1c9d Mon Sep 17 00:00:00 2001 From: Peter Date: Mon, 6 Jan 2020 08:34:20 -0500 Subject: [PATCH 34/84] Additional flake8 resolution --- smart_open/tests/test_gcs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index d1f07b90..f6b4e469 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -26,7 +26,7 @@ def ignore_resource_warnings(): if six.PY2: return - warnings.filterwarnings("ignore", category=ResourceWarning, message="unclosed.*") + warnings.filterwarnings("ignore", category=ResourceWarning, message="unclosed.*") # noqa class FakeBucket(object): @@ -239,11 +239,11 @@ def inner(*args, **kwargs): with mock.patch( 'google.cloud.storage.Client', return_value=storage_client, - ) as mocked_client, \ + ), \ mock.patch( 'smart_open.gcs.google_requests.AuthorizedSession', return_value=FakeAuthorizedSession(storage_client._credentials), - ) as mocked_session: + ): assert callable(func), 'you didn\'t provide a function!' try: # is it a method that needs a self arg? 
self_arg = inspect.signature(func).self From 13872de55b0fefdbc045ef09d8c255249ae3e88f Mon Sep 17 00:00:00 2001 From: Peter Date: Mon, 6 Jan 2020 08:39:11 -0500 Subject: [PATCH 35/84] Add source code encoding to test file --- smart_open/tests/test_gcs.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index f6b4e469..350711eb 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -1,3 +1,10 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2019 Radim Rehurek +# +# This code is distributed under the terms and conditions +# from the MIT License (MIT). +# import gzip import inspect import io @@ -297,7 +304,6 @@ def tearDown(self): @maybe_mock_gcs def test_iter(self): """Are GCS files iterated over correctly?""" - # a list of strings to test with expected = u"hello wořld\nhow are you?".encode('utf8') put_to_bucket(contents=expected) From 9b5c802d0dc88aac18382a9c4b4e1e84eeff7240 Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Mon, 6 Jan 2020 17:21:40 -0500 Subject: [PATCH 36/84] Docstrings in imperative mode --- smart_open/tests/test_gcs.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index 350711eb..bad7d388 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -268,20 +268,18 @@ def maybe_mock_gcs(func): @maybe_mock_gcs -def setUpModule(): - '''Called once by unittest when initializing this module. Sets up the +def setUpModule(): # noqa + """Called once by unittest when initializing this module. Sets up the test GCS bucket. - - ''' + """ storage_client.create_bucket(BUCKET_NAME) @maybe_mock_gcs -def tearDownModule(): - '''Called once by unittest when tearing down this module. Empties and +def tearDownModule(): # noqa + """Called once by unittest when tearing down this module. Empties and removes the test GCS bucket. 
- - ''' + """ try: bucket = get_bucket() bucket.delete() From daae9fe86b337a21583fa6490dd3748c1c6abe55 Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Mon, 6 Jan 2020 17:23:52 -0500 Subject: [PATCH 37/84] Test grammar --- smart_open/tests/test_gcs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index bad7d388..c7429bc0 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -269,7 +269,7 @@ def maybe_mock_gcs(func): @maybe_mock_gcs def setUpModule(): # noqa - """Called once by unittest when initializing this module. Sets up the + """Called once by unittest when initializing this module. Set up the test GCS bucket. """ storage_client.create_bucket(BUCKET_NAME) @@ -277,7 +277,7 @@ def setUpModule(): # noqa @maybe_mock_gcs def tearDownModule(): # noqa - """Called once by unittest when tearing down this module. Empties and + """Called once by unittest when tearing down this module. Empty and removes the test GCS bucket. 
""" try: From 86c24db85db53ee6b691939daec3150897f74563 Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Mon, 6 Jan 2020 17:24:41 -0500 Subject: [PATCH 38/84] Fix mock_gcs docstring --- smart_open/tests/test_gcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index c7429bc0..733399f7 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -241,7 +241,7 @@ def put_to_bucket(contents, num_attempts=12, sleep_time=5): def mock_gcs(func): - '''Mocks the function and provides additional required arguments.''' + """Mock the function and provide additional required arguments.""" def inner(*args, **kwargs): with mock.patch( 'google.cloud.storage.Client', From f4c478192a89601fc021d458d13203d37b1808f9 Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Mon, 6 Jan 2020 17:30:33 -0500 Subject: [PATCH 39/84] Only support gs scheme --- smart_open/gcs.py | 4 ++-- smart_open/smart_open_lib.py | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 5a459021..8b30b7d2 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -28,8 +28,8 @@ _UNKNOWN_FILE_SIZE = '*' -SUPPORTED_SCHEMES = ("gcs", "gs") -"""Supported schemes for GCS""" +SUPPORTED_SCHEME = "gs" +"""Supported scheme for GCS""" _MIN_MIN_PART_SIZE = _REQUIRED_CHUNK_MULTIPLE = 256 * 1024 """Google requires you to upload in multiples of 256 KB, except for the last part.""" diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index 8f69257b..5f7058a5 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -581,7 +581,7 @@ def _open_binary_stream(uri, mode, transport_params): filename = P.basename(urlparse.urlparse(uri).path) kw = _check_kwargs(smart_open_http.open, transport_params) return smart_open_http.open(uri, mode, **kw), filename - elif parsed_uri.scheme in smart_open_gcs.SUPPORTED_SCHEMES: + elif 
parsed_uri.scheme == smart_open_gcs.SUPPORTED_SCHEME: kw = _check_kwargs(smart_open_gcs.open, transport_params) return smart_open_gcs.open(parsed_uri.bucket_id, parsed_uri.blob_id, mode, **kw), filename else: @@ -693,7 +693,6 @@ def _parse_uri(uri_as_string): Supported URI schemes are: * file - * gcs * gs * hdfs * http @@ -705,7 +704,6 @@ def _parse_uri(uri_as_string): * webhdfs .s3, s3a and s3n are treated the same way. s3u is s3 but without SSL. - .gs and gcs are treated the same way. Valid URI examples:: From e03e5fe3dc6ba8b746f48ab1103ce4ae3c8c8dad Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Mon, 6 Jan 2020 17:43:52 -0500 Subject: [PATCH 40/84] Remove additional occurences of removed gcs scheme --- smart_open/smart_open_lib.py | 4 ++-- smart_open/tests/test_gcs.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index 5f7058a5..782bc5d4 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -746,7 +746,7 @@ def _parse_uri(uri_as_string): return Uri(scheme=parsed_uri.scheme, uri_path=uri_as_string) elif parsed_uri.scheme in smart_open_ssh.SCHEMES: return _parse_uri_ssh(parsed_uri) - elif parsed_uri.scheme in smart_open_gcs.SUPPORTED_SCHEMES: + elif parsed_uri.scheme in smart_open_gcs.SUPPORTED_SCHEME: return _parse_uri_gcs(parsed_uri) else: raise NotImplementedError( @@ -845,7 +845,7 @@ def _unquote(text): def _parse_uri_gcs(parsed_uri): - assert parsed_uri.scheme in smart_open_gcs.SUPPORTED_SCHEMES + assert parsed_uri.scheme in smart_open_gcs.SUPPORTED_SCHEME bucket_id, blob_id = parsed_uri.netloc, parsed_uri.path[1:] return Uri(scheme=parsed_uri.scheme, bucket_id=bucket_id, blob_id=blob_id) diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index 733399f7..69535529 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -526,7 +526,7 @@ def test_write_03(self): self.assertEqual(fout._total_parts, 1) 
# read back the same key and check its content - output = list(smart_open.open("gcs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME))) + output = list(smart_open.open("gs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME))) self.assertEqual(output, ["t" * 262142 + '\n', "t"]) @maybe_mock_gcs @@ -537,7 +537,7 @@ def test_write_04(self): pass # read back the same key and check its content - output = list(smart_open.open("gcs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME))) + output = list(smart_open.open("gs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME))) self.assertEqual(output, []) @@ -566,7 +566,7 @@ def test_buffered_writer_wrapper_works(self): with io.BufferedWriter(fout) as sub_out: sub_out.write(expected.encode('utf-8')) - with smart_open.open("gcs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME), 'rb') as fin: + with smart_open.open("gs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME), 'rb') as fin: with io.TextIOWrapper(fin, encoding='utf-8') as text: actual = text.read() From 7321bbd97cf384315f230c4348df07f8a482b75e Mon Sep 17 00:00:00 2001 From: Peter Date: Mon, 6 Jan 2020 18:15:51 -0500 Subject: [PATCH 41/84] Add test_read_past_end --- smart_open/tests/test_gcs.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index 69535529..afa2c68b 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -460,6 +460,16 @@ def test_read0_does_not_return_data(self): self.assertEqual(data, b'') + @maybe_mock_gcs + def test_read_past_end(self): + content = b'englishman\nin\nnew\nyork\n' + put_to_bucket(contents=content) + + with smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) as fin: + data = fin.read(100) + + self.assertEqual(data, content) + class BufferedOutputBaseTest(unittest.TestCase): """ From 5d1baf8c53dae38d027f4c0129fa83c384f22726 Mon Sep 17 00:00:00 2001 From: Peter Date: Tue, 7 Jan 2020 08:38:04 -0500 Subject: [PATCH 42/84] Clean up tests with class level 
decorator --- smart_open/tests/test_gcs.py | 61 +++++++++++++++--------------------- 1 file changed, 26 insertions(+), 35 deletions(-) diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index afa2c68b..57a1c2f1 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -190,10 +190,8 @@ def put(self, url, data=None, headers=None): self._credentials.client._uploads[url].write(data.read()) return FakeResponse() - def _blob_with_url(self, - url, - client # type: FakeClient - ): + def _blob_with_url(self, url, client): + # type: (str, FakeClient) -> None return client._uploads.get(url) @@ -240,7 +238,27 @@ def put_to_bucket(contents, num_attempts=12, sleep_time=5): assert False, 'failed to create bucket %s after %d attempts' % (BUCKET_NAME, num_attempts) -def mock_gcs(func): +def for_all_methods(decorator): + def decorate(cls): + for attr in cls.__dict__: # there's propably a better way to do this + if callable(getattr(cls, attr)): + setattr(cls, attr, decorator(getattr(cls, attr))) + return cls + return decorate + + +def mock_gcs(class_or_func): + """Mock all methods of a class or a function.""" + if inspect.isclass(class_or_func): + for attr in class_or_func.__dict__: + if callable(getattr(class_or_func, attr)): + setattr(class_or_func, attr, mock_gcs_func(getattr(class_or_func, attr))) + return class_or_func + else: + return mock_gcs_func(class_or_func) + + +def mock_gcs_func(func): """Mock the function and provide additional required arguments.""" def inner(*args, **kwargs): with mock.patch( @@ -287,6 +305,7 @@ def tearDownModule(): # noqa pass +@maybe_mock_gcs class SeekableBufferedInputBaseTest(unittest.TestCase): def setUp(self): # lower the multipart upload size, to speed up these tests @@ -295,11 +314,9 @@ def setUp(self): ignore_resource_warnings() - @maybe_mock_gcs def tearDown(self): cleanup_bucket() - @maybe_mock_gcs def test_iter(self): """Are GCS files iterated over correctly?""" expected = u"hello wořld\nhow 
are you?".encode('utf8') @@ -310,7 +327,6 @@ def test_iter(self): output = [line.rstrip(b'\n') for line in fin] self.assertEqual(output, expected.split(b'\n')) - @maybe_mock_gcs def test_iter_context_manager(self): # same thing but using a context manager expected = u"hello wořld\nhow are you?".encode('utf8') @@ -319,7 +335,6 @@ def test_iter_context_manager(self): output = [line.rstrip(b'\n') for line in fin] self.assertEqual(output, expected.split(b'\n')) - @maybe_mock_gcs def test_read(self): """Are GCS files read correctly?""" content = u"hello wořld\nhow are you?".encode('utf8') @@ -331,7 +346,6 @@ def test_read(self): self.assertEqual(content[6:14], fin.read(8)) # ř is 2 bytes self.assertEqual(content[14:], fin.read()) # read the rest - @maybe_mock_gcs def test_seek_beginning(self): """Does seeking to the beginning of GCS files work correctly?""" content = u"hello wořld\nhow are you?".encode('utf8') @@ -347,7 +361,6 @@ def test_seek_beginning(self): fin.seek(0) self.assertEqual(content, fin.read(-1)) # same thing - @maybe_mock_gcs def test_seek_start(self): """Does seeking from the start of GCS files work correctly?""" content = u"hello wořld\nhow are you?".encode('utf8') @@ -359,7 +372,6 @@ def test_seek_start(self): self.assertEqual(fin.tell(), 6) self.assertEqual(fin.read(6), u'wořld'.encode('utf-8')) - @maybe_mock_gcs def test_seek_current(self): """Does seeking from the middle of GCS files work correctly?""" content = u"hello wořld\nhow are you?".encode('utf8') @@ -371,7 +383,6 @@ def test_seek_current(self): self.assertEqual(seek, 6) self.assertEqual(fin.read(6), u'wořld'.encode('utf-8')) - @maybe_mock_gcs def test_seek_end(self): """Does seeking from the end of GCS files work correctly?""" content = u"hello wořld\nhow are you?".encode('utf8') @@ -382,7 +393,6 @@ def test_seek_end(self): self.assertEqual(seek, len(content) - 4) self.assertEqual(fin.read(), b'you?') - @maybe_mock_gcs def test_detect_eof(self): content = u"hello wořld\nhow are 
you?".encode('utf8') put_to_bucket(contents=content) @@ -394,7 +404,6 @@ def test_detect_eof(self): fin.seek(0, whence=smart_open.gcs.END) self.assertEqual(eof, fin.tell()) - @maybe_mock_gcs def test_read_gzip(self): expected = u'раcцветали яблони и груши, поплыли туманы над рекой...'.encode('utf-8') buf = io.BytesIO() @@ -423,7 +432,6 @@ def test_read_gzip(self): self.assertEqual(expected, actual) - @maybe_mock_gcs def test_readline(self): content = b'englishman\nin\nnew\nyork\n' put_to_bucket(contents=content) @@ -439,7 +447,6 @@ def test_readline(self): expected = [b'englishman\n', b'in\n', b'new\n', b'york\n'] self.assertEqual(expected, actual) - @maybe_mock_gcs def test_readline_tiny_buffer(self): content = b'englishman\nin\nnew\nyork\n' put_to_bucket(contents=content) @@ -450,7 +457,6 @@ def test_readline_tiny_buffer(self): expected = [b'englishman\n', b'in\n', b'new\n', b'york\n'] self.assertEqual(expected, actual) - @maybe_mock_gcs def test_read0_does_not_return_data(self): content = b'englishman\nin\nnew\nyork\n' put_to_bucket(contents=content) @@ -460,7 +466,6 @@ def test_read0_does_not_return_data(self): self.assertEqual(data, b'') - @maybe_mock_gcs def test_read_past_end(self): content = b'englishman\nin\nnew\nyork\n' put_to_bucket(contents=content) @@ -471,6 +476,7 @@ def test_read_past_end(self): self.assertEqual(data, content) +@maybe_mock_gcs class BufferedOutputBaseTest(unittest.TestCase): """ Test writing into GCS files. 
@@ -479,11 +485,9 @@ class BufferedOutputBaseTest(unittest.TestCase): def setUp(self): ignore_resource_warnings() - @maybe_mock_gcs def tearDown(self): cleanup_bucket() - @maybe_mock_gcs def test_write_01(self): """Does writing into GCS work correctly?""" test_string = u"žluťoučký koníček".encode('utf8') @@ -495,7 +499,6 @@ def test_write_01(self): self.assertEqual(output, [test_string]) - @maybe_mock_gcs def test_write_01a(self): """Does gcs write fail on incorrect input?""" try: @@ -506,7 +509,6 @@ def test_write_01a(self): else: self.fail() - @maybe_mock_gcs def test_write_02(self): """Does gcs write unicode-utf8 conversion work?""" smart_open_write = smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) @@ -516,7 +518,6 @@ def test_write_02(self): fout.write(u"testžížáč".encode("utf-8")) self.assertEqual(fout.tell(), 14) - @maybe_mock_gcs def test_write_03(self): """Does gcs multipart chunking work correctly?""" # write @@ -539,7 +540,6 @@ def test_write_03(self): output = list(smart_open.open("gs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME))) self.assertEqual(output, ["t" * 262142 + '\n', "t"]) - @maybe_mock_gcs def test_write_04(self): """Does writing no data cause key with an empty value to be created?""" smart_open_write = smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) @@ -551,7 +551,6 @@ def test_write_04(self): self.assertEqual(output, []) - @maybe_mock_gcs def test_gzip(self): expected = u'а не спеть ли мне песню... 
о любви'.encode('utf-8') with smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) as fout: @@ -564,7 +563,6 @@ def test_gzip(self): self.assertEqual(expected, actual) - @maybe_mock_gcs def test_buffered_writer_wrapper_works(self): """ Ensure that we can wrap a smart_open gcs stream in a BufferedWriter, which @@ -582,7 +580,6 @@ def test_buffered_writer_wrapper_works(self): self.assertEqual(expected, actual) - @maybe_mock_gcs def test_binary_iterator(self): expected = u"выйду ночью в поле с конём".encode('utf-8').split(b' ') put_to_bucket(contents=b"\n".join(expected)) @@ -590,20 +587,17 @@ def test_binary_iterator(self): actual = [line.rstrip() for line in fin] self.assertEqual(expected, actual) - @maybe_mock_gcs def test_nonexisting_bucket(self): expected = u"выйду ночью в поле с конём".encode('utf-8') with self.assertRaises(google.api_core.exceptions.NotFound): with smart_open.gcs.open('thisbucketdoesntexist', 'mykey', 'wb') as fout: fout.write(expected) - @maybe_mock_gcs def test_read_nonexisting_key(self): with self.assertRaises(google.api_core.exceptions.NotFound): with smart_open.gcs.open(BUCKET_NAME, 'my_nonexisting_key', 'rb') as fin: fin.read() - @maybe_mock_gcs def test_double_close(self): text = u'там за туманами, вечными, пьяными'.encode('utf-8') fout = smart_open.gcs.open(BUCKET_NAME, 'key', 'wb') @@ -611,7 +605,6 @@ def test_double_close(self): fout.close() fout.close() - @maybe_mock_gcs def test_flush_close(self): text = u'там за туманами, вечными, пьяными'.encode('utf-8') fout = smart_open.gcs.open(BUCKET_NAME, 'key', 'wb') @@ -619,7 +612,6 @@ def test_flush_close(self): fout.flush() fout.close() - @maybe_mock_gcs def test_terminate(self): text = u'там за туманами, вечными, пьяными'.encode('utf-8') fout = smart_open.gcs.open(BUCKET_NAME, 'key', 'wb') @@ -631,15 +623,14 @@ def test_terminate(self): fin.read() +@maybe_mock_gcs class OpenTest(unittest.TestCase): def setUp(self): ignore_resource_warnings() - @maybe_mock_gcs def 
tearDown(self): cleanup_bucket() - @maybe_mock_gcs def test_read_never_returns_none(self): """read should never return None.""" test_string = u"ветер по морю гуляет..." From 56112cd2372144c0627c349a8c5c7f536c73dd91 Mon Sep 17 00:00:00 2001 From: Peter Date: Tue, 7 Jan 2020 08:39:45 -0500 Subject: [PATCH 43/84] Remove stub function --- smart_open/tests/test_gcs.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index 57a1c2f1..06d4db4e 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -238,15 +238,6 @@ def put_to_bucket(contents, num_attempts=12, sleep_time=5): assert False, 'failed to create bucket %s after %d attempts' % (BUCKET_NAME, num_attempts) -def for_all_methods(decorator): - def decorate(cls): - for attr in cls.__dict__: # there's propably a better way to do this - if callable(getattr(cls, attr)): - setattr(cls, attr, decorator(getattr(cls, attr))) - return cls - return decorate - - def mock_gcs(class_or_func): """Mock all methods of a class or a function.""" if inspect.isclass(class_or_func): From f4ad80b0aa36d8b52071f7be68ee350c4795d703 Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Tue, 7 Jan 2020 09:31:24 -0500 Subject: [PATCH 44/84] Use equality instead of in for scheme --- smart_open/smart_open_lib.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index 782bc5d4..037a07f1 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -746,7 +746,7 @@ def _parse_uri(uri_as_string): return Uri(scheme=parsed_uri.scheme, uri_path=uri_as_string) elif parsed_uri.scheme in smart_open_ssh.SCHEMES: return _parse_uri_ssh(parsed_uri) - elif parsed_uri.scheme in smart_open_gcs.SUPPORTED_SCHEME: + elif parsed_uri.scheme == smart_open_gcs.SUPPORTED_SCHEME: return _parse_uri_gcs(parsed_uri) else: raise NotImplementedError( @@ -845,7 +845,7 @@ def _unquote(text): 
def _parse_uri_gcs(parsed_uri): - assert parsed_uri.scheme in smart_open_gcs.SUPPORTED_SCHEME + assert parsed_uri.scheme == smart_open_gcs.SUPPORTED_SCHEME bucket_id, blob_id = parsed_uri.netloc, parsed_uri.path[1:] return Uri(scheme=parsed_uri.scheme, bucket_id=bucket_id, blob_id=blob_id) From 8611dbae4c5d6d0bedd93592f709d77a0010b67b Mon Sep 17 00:00:00 2001 From: Peter Date: Tue, 7 Jan 2020 17:24:38 -0500 Subject: [PATCH 45/84] Fix repr and mock import --- smart_open/gcs.py | 2 ++ smart_open/tests/test_gcs.py | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 8b30b7d2..e3918f68 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -355,6 +355,7 @@ def __repr__(self): "blob=%r, " "buffer_size=%r)" ) % ( + self.__class__.__name__, self._bucket.name, self._blob.name, self._buffer_size, @@ -515,6 +516,7 @@ def __repr__(self): "blob=%r, " "min_part_size=%r)" ) % ( + self.__class__.__name__, self._bucket.name, self._blob.name, self._min_part_size, diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index 06d4db4e..bb5a6cad 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -13,7 +13,10 @@ import time import uuid import unittest -from unittest import mock +try: + from unittest import mock +except: + import mock import warnings import google.cloud From 03ac2a9890ae1bd750a36c5570d5ed4e5947ddce Mon Sep 17 00:00:00 2001 From: Peter Date: Tue, 7 Jan 2020 17:25:34 -0500 Subject: [PATCH 46/84] Specify ImportError --- smart_open/tests/test_gcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index bb5a6cad..65364dd9 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -15,7 +15,7 @@ import unittest try: from unittest import mock -except: +except ImportError: import mock import warnings From 93e4b0f047ec9da7dd71cd4c4da2e9880fa1fd2e Mon Sep 17 00:00:00 2001 
From: Peter Date: Wed, 8 Jan 2020 08:33:47 -0500 Subject: [PATCH 47/84] Use BytesIO in integration test --- integration-tests/test_gcs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/integration-tests/test_gcs.py b/integration-tests/test_gcs.py index 67e8e98d..00d10864 100644 --- a/integration-tests/test_gcs.py +++ b/integration-tests/test_gcs.py @@ -29,14 +29,14 @@ def write_read(key, content, write_mode, read_mode, encoding=None, **kwargs): def read_length_prefixed_messages(key, read_mode, encoding=None, **kwargs): with smart_open.open(key, read_mode, encoding=encoding, **kwargs) as fin: - actual = b'' + result = io.BytesIO() length_byte = fin.read(1) while len(length_byte): - actual += length_byte + result.write(length_byte) msg = fin.read(ord(length_byte)) - actual += msg + result.write(msg) length_byte = fin.read(1) - return actual + return result.getvalue() def test_gcs_readwrite_text(benchmark): From 2bcf8e3ae21577fdfbbb63db94e051d67db95700 Mon Sep 17 00:00:00 2001 From: Peter Date: Wed, 8 Jan 2020 08:36:04 -0500 Subject: [PATCH 48/84] Explicit encoding in integration tests --- integration-tests/test_gcs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integration-tests/test_gcs.py b/integration-tests/test_gcs.py index 00d10864..ee674bc4 100644 --- a/integration-tests/test_gcs.py +++ b/integration-tests/test_gcs.py @@ -44,7 +44,7 @@ def test_gcs_readwrite_text(benchmark): key = _GCS_URL + '/sanity.txt' text = 'с гранатою в кармане, с чекою в руке' - actual = benchmark(write_read, key, text, 'w', 'r', 'utf-8') + actual = benchmark(write_read, key, text, 'w', 'r', encoding='utf-8') assert actual == text @@ -53,7 +53,7 @@ def test_gcs_readwrite_text_gzip(benchmark): key = _GCS_URL + '/sanity.txt.gz' text = 'не чайки здесь запели на знакомом языке' - actual = benchmark(write_read, key, text, 'w', 'r', 'utf-8') + actual = benchmark(write_read, key, text, 'w', 'r', encoding='utf-8') assert actual == text From 
0a48ba43a4a141af5713596ad27b5b6deb4e3a18 Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Wed, 8 Jan 2020 08:39:09 -0500 Subject: [PATCH 49/84] Remove unneeded variable in integration test Co-Authored-By: Michael Penkov --- integration-tests/test_gcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration-tests/test_gcs.py b/integration-tests/test_gcs.py index ee674bc4..e1816477 100644 --- a/integration-tests/test_gcs.py +++ b/integration-tests/test_gcs.py @@ -23,7 +23,7 @@ def write_read(key, content, write_mode, read_mode, encoding=None, **kwargs): with smart_open.open(key, write_mode, encoding=encoding, **kwargs) as fout: fout.write(content) with smart_open.open(key, read_mode, encoding=encoding, **kwargs) as fin: - actual = fin.read() + return fin.read() return actual From 18ca5f807a5de5daaa6ca60c1b6128ec99222021 Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Wed, 8 Jan 2020 08:39:48 -0500 Subject: [PATCH 50/84] Remove unneeded data variable Co-Authored-By: Michael Penkov --- smart_open/gcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index e3918f68..d4062021 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -475,7 +475,7 @@ def _upload_next_part(self): 'Content-Range': _make_range_string(start, stop, end) } data = self._buf - response = self._session.put(self._resumeable_upload_url, data=data, headers=headers) + response = self._session.put(self._resumeable_upload_url, data=self._buf, headers=headers) if response.status_code not in _SUCCESSFUL_STATUS_CODES: logger.error("upload failed with status %s", response.status_code) From ffa8083762e826244be5a7571dec6636a6cef9d1 Mon Sep 17 00:00:00 2001 From: Peter Date: Wed, 8 Jan 2020 08:42:48 -0500 Subject: [PATCH 51/84] Remove .format from test_gcs --- smart_open/tests/test_gcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index 
65364dd9..e7370175 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -90,7 +90,7 @@ def __init__(self, name, bucket, create=True): self._create_if_not_exists() def create_resumable_upload_session(self): - resumeable_upload_url = self.RESUMABLE_SESSION_URI_TEMPLATE.format( + resumeable_upload_url = self.RESUMABLE_SESSION_URI_TEMPLATE % dict( bucket=self._bucket.name, upload_id=uuid.uuid4().hex, ) From 9579b1dbb7875bbc8eebfa1fd42af49cbe756aba Mon Sep 17 00:00:00 2001 From: Peter Date: Wed, 8 Jan 2020 08:44:17 -0500 Subject: [PATCH 52/84] Move RESUMEABLE_SESSION_URI_TEMPLATE to module scope --- smart_open/tests/test_gcs.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index e7370175..23e88833 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -30,6 +30,13 @@ WRITE_BLOB_NAME = 'test-write-blob' DISABLE_MOCKS = os.environ.get('SO_DISABLE_MOCKS') == "1" +RESUMABLE_SESSION_URI_TEMPLATE = ( + 'https://www.googleapis.com/upload/storage/v1/b/' + '{bucket}' + '/o?uploadType=resumable&upload_id=' + '{upload_id}' +) + logger = logging.getLogger(__name__) @@ -73,13 +80,6 @@ def _delete_blob(self, blob): class FakeBlob(object): - RESUMABLE_SESSION_URI_TEMPLATE = ( - 'https://www.googleapis.com/upload/storage/v1/b/' - '{bucket}' - '/o?uploadType=resumable&upload_id=' - '{upload_id}' - ) - def __init__(self, name, bucket, create=True): self.name = name self._bucket = bucket # type: FakeBucket @@ -90,7 +90,7 @@ def __init__(self, name, bucket, create=True): self._create_if_not_exists() def create_resumable_upload_session(self): - resumeable_upload_url = self.RESUMABLE_SESSION_URI_TEMPLATE % dict( + resumeable_upload_url = RESUMABLE_SESSION_URI_TEMPLATE % dict( bucket=self._bucket.name, upload_id=uuid.uuid4().hex, ) From aa90d64f8d27e9517094592483fbfa8c37f5eabe Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Wed, 8 Jan 2020 
10:53:25 -0500 Subject: [PATCH 53/84] Remove unnecessary explicit encoding Co-Authored-By: Michael Penkov --- integration-tests/test_gcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration-tests/test_gcs.py b/integration-tests/test_gcs.py index e1816477..140875e7 100644 --- a/integration-tests/test_gcs.py +++ b/integration-tests/test_gcs.py @@ -19,7 +19,7 @@ def initialize_bucket(): blob.delete() -def write_read(key, content, write_mode, read_mode, encoding=None, **kwargs): +def write_read(key, content, write_mode, read_mode, **kwargs): with smart_open.open(key, write_mode, encoding=encoding, **kwargs) as fout: fout.write(content) with smart_open.open(key, read_mode, encoding=encoding, **kwargs) as fin: From adf5d56f82fd3f2a08f39200be0a5dae6c5a079e Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Wed, 8 Jan 2020 10:55:05 -0500 Subject: [PATCH 54/84] Remove unnecessary variable in read Co-Authored-By: Michael Penkov --- smart_open/gcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index d4062021..355df60b 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -264,7 +264,7 @@ def read(self, size=-1): elif size < 0: from_buf = self._read_from_buffer() self._current_pos = self._size - return from_buf + self._raw_reader.read() + return self._read_from_buffer() + self._raw_reader.read() # # Return unused data first From 03a7d728a38975bbb7915e90c2fb1475ba101c33 Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Wed, 8 Jan 2020 14:44:45 -0500 Subject: [PATCH 55/84] Change assertion to use _REQUIRED_CHUNK_MULTIPLE Co-Authored-By: Michael Penkov --- smart_open/gcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 355df60b..4279c0b3 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -380,7 +380,7 @@ def __init__( self._credentials = self._client._credentials self._bucket = self._client.bucket(bucket) # type: 
google.cloud.storage.Bucket self._blob = self._bucket.blob(blob) # type: google.cloud.storage.Blob - assert min_part_size % _MIN_MIN_PART_SIZE == 0, 'min part size must be a multiple of 256KB' + assert min_part_size % _REQUIRED_CHUNK_MULTIPLE == 0, 'min part size must be a multiple of 256KB' self._min_part_size = min_part_size self._total_size = 0 From 115940a05d673472f73f9c88d201149cff74dabf Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Wed, 8 Jan 2020 15:51:05 -0500 Subject: [PATCH 56/84] Import clamp from s3 --- integration-tests/test_gcs.py | 9 ++++----- smart_open/gcs.py | 21 ++------------------- smart_open/tests/test_gcs.py | 7 ------- 3 files changed, 6 insertions(+), 31 deletions(-) diff --git a/integration-tests/test_gcs.py b/integration-tests/test_gcs.py index 140875e7..9fba98df 100644 --- a/integration-tests/test_gcs.py +++ b/integration-tests/test_gcs.py @@ -20,15 +20,14 @@ def initialize_bucket(): def write_read(key, content, write_mode, read_mode, **kwargs): - with smart_open.open(key, write_mode, encoding=encoding, **kwargs) as fout: + with smart_open.open(key, write_mode, **kwargs) as fout: fout.write(content) - with smart_open.open(key, read_mode, encoding=encoding, **kwargs) as fin: + with smart_open.open(key, read_mode, **kwargs) as fin: return fin.read() - return actual -def read_length_prefixed_messages(key, read_mode, encoding=None, **kwargs): - with smart_open.open(key, read_mode, encoding=encoding, **kwargs) as fin: +def read_length_prefixed_messages(key, read_mode, **kwargs): + with smart_open.open(key, read_mode, **kwargs) as fin: result = io.BytesIO() length_byte = fin.read(1) while len(length_byte): diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 4279c0b3..9aa2cab8 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -11,6 +11,7 @@ import six import smart_open.bytebuffer +import smart_open.s3 logger = logging.getLogger(__name__) @@ -51,22 +52,6 @@ _SUCCESSFUL_STATUS_CODES = (200, 201) -def clamp(value, minval, 
maxval): - """Restrict a value to a range within specified min and max values - - Parameters - ---------- - value: int - The value to clamp - minval: int - The minimum value within the range to clamp to - maxval: int - The maxiumum value within the range to clamp to - - Returns the clamped value""" - return max(min(value, maxval), minval) - - def _make_range_string(start, stop=None, end=_UNKNOWN_FILE_SIZE): # # https://cloud.google.com/storage/docs/xml-api/resumable-upload#step_3upload_the_file_blocks @@ -240,7 +225,7 @@ def seek(self, offset, whence=START): new_position = self._current_pos + offset else: new_position = self._size + offset - new_position = clamp(new_position, 0, self._size) + new_position = smart_open.s3.clamp(new_position, 0, self._size) self._current_pos = new_position self._raw_reader.seek(new_position) logger.debug('new_position: %r', self._current_pos) @@ -262,7 +247,6 @@ def read(self, size=-1): if size == 0: return b'' elif size < 0: - from_buf = self._read_from_buffer() self._current_pos = self._size return self._read_from_buffer() + self._raw_reader.read() @@ -474,7 +458,6 @@ def _upload_next_part(self): 'Content-Length': str(content_length), 'Content-Range': _make_range_string(start, stop, end) } - data = self._buf response = self._session.put(self._resumeable_upload_url, data=self._buf, headers=headers) if response.status_code not in _SUCCESSFUL_STATUS_CODES: diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index 23e88833..0778eca8 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -637,13 +637,6 @@ def test_read_never_returns_none(self): self.assertEqual(r.read(), b"") -class ClampTest(unittest.TestCase): - def test(self): - self.assertEqual(smart_open.gcs.clamp(5, 0, 10), 5) - self.assertEqual(smart_open.gcs.clamp(11, 0, 10), 10) - self.assertEqual(smart_open.gcs.clamp(-1, 0, 10), 0) - - class MakeRangeStringTest(unittest.TestCase): def test_no_stop(self): start, stop = 1, None From 
b735b7e04d503730f62ec90bb9b7a66022b381f8 Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Wed, 8 Jan 2020 16:00:57 -0500 Subject: [PATCH 57/84] Add comment on buffering being read-only --- smart_open/gcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 9aa2cab8..08f4cd75 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -84,7 +84,7 @@ def open( mode: str The mode for opening the object. Must be either "rb" or "wb". buffering: int, optional - The buffer size to use when performing I/O. + The buffer size to use when performing I/O. For reading only. min_part_size: int, optional The minimum part size for multipart uploads. For writing only. client: google.cloud.google.cloud.storage.Client, optional From 1d4805ca7d3f6d9afa0829c01a410ab03a2f9381 Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Wed, 8 Jan 2020 16:01:19 -0500 Subject: [PATCH 58/84] Fix client docstring --- smart_open/gcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 08f4cd75..c53d825b 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -87,7 +87,7 @@ def open( The buffer size to use when performing I/O. For reading only. min_part_size: int, optional The minimum part size for multipart uploads. For writing only. - client: google.cloud.google.cloud.storage.Client, optional + client: google.cloud.storage.Client, optional The GCS client to use when working with google-cloud-storage. 
""" From 0b5797faa54627663a27b7b0a4a7ea27de73c33e Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Wed, 8 Jan 2020 16:02:57 -0500 Subject: [PATCH 59/84] Make SeekableRawReader internal --- smart_open/gcs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index c53d825b..6dc12255 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -110,7 +110,7 @@ def open( raise NotImplementedError('GCS support for mode %r not implemented' % mode) -class SeekableRawReader(object): +class _SeekableRawReader(object): """Read an GCS object.""" def __init__(self, gcs_blob, size): @@ -171,7 +171,7 @@ def __init__( raise google.cloud.exceptions.NotFound('blob {} not found in {}'.format(key, bucket)) self._size = self._blob.size if self._blob.size is not None else 0 - self._raw_reader = SeekableRawReader(self._blob, self._size) + self._raw_reader = _SeekableRawReader(self._blob, self._size) self._current_pos = 0 self._buffer_size = buffering self._buffer = smart_open.bytebuffer.ByteBuffer(buffering) From 6dd7e761f486bcd69f3dce7823d7520b31de1cfc Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Wed, 8 Jan 2020 16:07:40 -0500 Subject: [PATCH 60/84] Add return value to seek in _SeekableRawReader --- smart_open/gcs.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 6dc12255..b7d7bf93 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -123,8 +123,11 @@ def seek(self, position): """Seek to the specified position (byte offset) in the GCS key. :param int position: The byte offset from the beginning of the key. + + Returns the position after seeking. 
""" self._position = position + return self._position def read(self, size=-1): if self._position >= self._size: From 94ce2bdc1f736061c39b52dbddfb19bdb4fdc851 Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Wed, 8 Jan 2020 16:35:07 -0500 Subject: [PATCH 61/84] Allow integration test to take a prefix --- integration-tests/test_gcs.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/integration-tests/test_gcs.py b/integration-tests/test_gcs.py index 9fba98df..a8a49988 100644 --- a/integration-tests/test_gcs.py +++ b/integration-tests/test_gcs.py @@ -1,22 +1,16 @@ # -*- coding: utf-8 -*- import io import os - -import google.cloud.storage +import subprocess import smart_open -_GCS_BUCKET = os.environ.get('SO_GCS_BUCKET') -_GCS_URL = 'gs://' + _GCS_BUCKET -assert _GCS_BUCKET is not None, 'please set the SO_GCS_BUCKET environment variable' +_GCS_URL = os.environ.get('SO_GCS_URL') +assert _GCS_URL is not None, 'please set the SO_GCS_URL environment variable' def initialize_bucket(): - storage_client = google.cloud.storage.Client() - bucket = storage_client.get_bucket(_GCS_BUCKET) - blobs = bucket.list_blobs() - for blob in blobs: - blob.delete() + subprocess.check_call(['gsutil' 'rm', '-r', _GCS_URL]) def write_read(key, content, write_mode, read_mode, **kwargs): From d2a4c2e3439b353d18698b204a795313e2b46843 Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Wed, 8 Jan 2020 16:38:45 -0500 Subject: [PATCH 62/84] Add doc to NotFound exception --- smart_open/gcs.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index b7d7bf93..0c86c8a2 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -155,8 +155,11 @@ def _download_blob_chunk(self, size): class SeekableBufferedInputBase(io.BufferedIOBase): """Reads bytes from GCS. - Implements the io.BufferedIOBase interface of the standard library.""" + Implements the io.BufferedIOBase interface of the standard library. 
+ + :raises google.cloud.exceptions.NotFound: Raised when the blob to read from does not exist. + """ def __init__( self, bucket, From 674ad80795e19c5a9902fe9e90f054baa0a55c9a Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Wed, 8 Jan 2020 16:39:22 -0500 Subject: [PATCH 63/84] Fix misleading log statement --- smart_open/gcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 0c86c8a2..16a42792 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -234,7 +234,7 @@ def seek(self, offset, whence=START): new_position = smart_open.s3.clamp(new_position, 0, self._size) self._current_pos = new_position self._raw_reader.seek(new_position) - logger.debug('new_position: %r', self._current_pos) + logger.debug('current_pos: %r', self._current_pos) self._buffer.empty() self._eof = self._current_pos == self._size From 910bddea5c322f75d8f5085cd35a55bcd0715c2e Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Wed, 8 Jan 2020 16:59:03 -0500 Subject: [PATCH 64/84] Various formatting changes --- smart_open/gcs.py | 67 +++++++++++++++++------------------- smart_open/tests/test_gcs.py | 6 ++-- 2 files changed, 35 insertions(+), 38 deletions(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 16a42792..196a09d0 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -179,8 +179,8 @@ def __init__( self._raw_reader = _SeekableRawReader(self._blob, self._size) self._current_pos = 0 - self._buffer_size = buffering - self._buffer = smart_open.bytebuffer.ByteBuffer(buffering) + self._current_part_size = buffering + self._current_part = smart_open.bytebuffer.ByteBuffer(buffering) self._eof = False self._line_terminator = line_terminator @@ -236,7 +236,7 @@ def seek(self, offset, whence=START): self._raw_reader.seek(new_position) logger.debug('current_pos: %r', self._current_pos) - self._buffer.empty() + self._current_part.empty() self._eof = self._current_pos == self._size return self._current_pos @@ -259,7 
+259,7 @@ def read(self, size=-1): # # Return unused data first # - if len(self._buffer) >= size: + if len(self._current_part) >= size: return self._read_from_buffer(size) # @@ -292,15 +292,15 @@ def readline(self, limit=-1): if limit != -1: raise NotImplementedError('limits other than -1 not implemented yet') the_line = io.BytesIO() - while not (self._eof and len(self._buffer) == 0): + while not (self._eof and len(self._current_part) == 0): # - # In the worst case, we're reading the unread part of self._buffer + # In the worst case, we're reading the unread part of self._current_part # twice here, once in the if condition and once when calling index. # # This is sub-optimal, but better than the alternative: wrapping # .index in a try..except, because that is slower. # - remaining_buffer = self._buffer.peek() + remaining_buffer = self._current_part.peek() if self._line_terminator in remaining_buffer: next_newline = remaining_buffer.index(self._line_terminator) the_line.write(self._read_from_buffer(next_newline + 1)) @@ -319,24 +319,23 @@ def terminate(self): # def _read_from_buffer(self, size=-1): """Remove at most size bytes from our buffer and return them.""" - # logger.debug('reading %r bytes from %r byte-long buffer', size, len(self._buffer)) - size = size if size >= 0 else len(self._buffer) - part = self._buffer.read(size) + # logger.debug('reading %r bytes from %r byte-long buffer', size, len(self._current_part)) + size = size if size >= 0 else len(self._current_part) + part = self._current_part.read(size) self._current_pos += len(part) # logger.debug('part: %r', part) return part def _fill_buffer(self, size=-1): - size = size if size >= 0 else self._buffer._chunk_size - while len(self._buffer) < size and not self._eof: - bytes_read = self._buffer.fill(self._raw_reader) + size = size if size >= 0 else self._current_part._chunk_size + while len(self._current_part) < size and not self._eof: + bytes_read = self._current_part.fill(self._raw_reader) if bytes_read 
== 0: logger.debug('reached EOF while filling buffer') self._eof = True def __str__(self): - return "(%s, %r, %r)" % \ - (self.__class__.__name__, self._bucket.name, self._blob.name) + return "(%s, %r, %r)" % (self.__class__.__name__, self._bucket.name, self._blob.name) def __repr__(self): return ( @@ -348,7 +347,7 @@ def __repr__(self): self.__class__.__name__, self._bucket.name, self._blob.name, - self._buffer_size, + self._current_part_size, ) @@ -375,14 +374,14 @@ def __init__( self._total_size = 0 self._total_parts = 0 - self._buf = io.BytesIO() + self._current_part = io.BytesIO() self._session = google_requests.AuthorizedSession(self._credentials) # # https://cloud.google.com/storage/docs/json_api/v1/how-tos/resumable-upload#start-resumable # - self._resumeable_upload_url = self._blob.create_resumable_upload_session() + self._resumable_upload_url = self._blob.create_resumable_upload_session() # # This member is part of the io.BufferedIOBase interface. @@ -399,7 +398,7 @@ def close(self): logger.debug("closing") if self._total_size == 0: # empty files self._upload_empty_part() - if self._buf.tell(): + if self._current_part.tell(): self._upload_next_part() logger.debug("successfully closed") @@ -424,13 +423,12 @@ def write(self, b): do any HTTP transfer right away.""" if not isinstance(b, _BINARY_TYPES): - raise TypeError( - "input must be one of %r, got: %r" % (_BINARY_TYPES, type(b))) + raise TypeError("input must be one of %r, got: %r" % (_BINARY_TYPES, type(b))) - self._buf.write(b) + self._current_part.write(b) self._total_size += len(b) - if self._buf.tell() >= self._min_part_size: + if self._current_part.tell() >= self._min_part_size: self._upload_next_part() return len(b) @@ -439,7 +437,7 @@ def terminate(self): # # https://cloud.google.com/storage/docs/xml-api/resumable-upload#example_cancelling_an_upload # - self._session.delete(self._resumeable_upload_url) + self._session.delete(self._resumable_upload_url) # # Internal methods. 
@@ -447,8 +445,10 @@ def terminate(self): def _upload_next_part(self): part_num = self._total_parts + 1 logger.info("uploading part #%i, %i bytes (total %.3fGB)", - part_num, self._buf.tell(), self._total_size / 1024.0 ** 3) - content_length = self._buf.tell() + part_num, + self._current_part.tell(), + self._total_size / 1024.0 ** 3) + content_length = self._current_part.tell() start = self._total_size - content_length stop = self._total_size - 1 if stop - start + 1 == content_length: @@ -458,13 +458,13 @@ def _upload_next_part(self): if content_length % _REQUIRED_CHUNK_MULTIPLE != 0: stop = content_length % _REQUIRED_CHUNK_MULTIPLE - 1 - self._buf.seek(0) + self._current_part.seek(0) headers = { 'Content-Length': str(content_length), 'Content-Range': _make_range_string(start, stop, end) } - response = self._session.put(self._resumeable_upload_url, data=self._buf, headers=headers) + response = self._session.put(self._resumable_upload_url, data=self._current_part, headers=headers) if response.status_code not in _SUCCESSFUL_STATUS_CODES: logger.error("upload failed with status %s", response.status_code) @@ -473,14 +473,12 @@ def _upload_next_part(self): logger.debug("upload of part #%i finished" % part_num) self._total_parts += 1 - self._buf = io.BytesIO() + self._current_part = io.BytesIO() def _upload_empty_part(self): logger.info("creating empty file") - headers = { - 'Content-Length': '0' - } - response = self._session.put(self._resumeable_upload_url, headers=headers) + headers = {'Content-Length': '0'} + response = self._session.put(self._resumable_upload_url, headers=headers) assert response.status_code in _SUCCESSFUL_STATUS_CODES self._total_parts += 1 @@ -495,8 +493,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): self.close() def __str__(self): - return "(%s, %r, %r)" % \ - (self.__class__.__name__, self._bucket.name, self._blob.name) + return "(%s, %r, %r)" % (self.__class__.__name__, self._bucket.name, self._blob.name) def __repr__(self): return ( diff 
--git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index 0778eca8..2451a88e 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -520,14 +520,14 @@ def test_write_03(self): ) with smart_open_write as fout: fout.write(b"t" * 262141) - self.assertEqual(fout._buf.tell(), 262141) + self.assertEqual(fout._current_part.tell(), 262141) fout.write(b"t\n") - self.assertEqual(fout._buf.tell(), 262143) + self.assertEqual(fout._current_part.tell(), 262143) self.assertEqual(fout._total_parts, 0) fout.write(b"t") - self.assertEqual(fout._buf.tell(), 0) + self.assertEqual(fout._current_part.tell(), 0) self.assertEqual(fout._total_parts, 1) # read back the same key and check its content From fb44cd25b227616dfececa2be8fdaecb8f1fc950 Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Wed, 8 Jan 2020 17:05:22 -0500 Subject: [PATCH 65/84] Add additional assertion for min_part_size --- smart_open/gcs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 196a09d0..27e45c18 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -370,6 +370,7 @@ def __init__( self._bucket = self._client.bucket(bucket) # type: google.cloud.storage.Bucket self._blob = self._bucket.blob(blob) # type: google.cloud.storage.Blob assert min_part_size % _REQUIRED_CHUNK_MULTIPLE == 0, 'min part size must be a multiple of 256KB' + assert min_part_size >= _MIN_MIN_PART_SIZE, 'min part size must be greater than 256KB' self._min_part_size = min_part_size self._total_size = 0 From f17b9b170b48776973d3fe3f578d595872c84b27 Mon Sep 17 00:00:00 2001 From: Peter Date: Wed, 8 Jan 2020 18:07:42 -0500 Subject: [PATCH 66/84] Clean up _upload_next_part --- smart_open/gcs.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 27e45c18..beca4408 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -449,15 +449,9 @@ def _upload_next_part(self): part_num, 
self._current_part.tell(), self._total_size / 1024.0 ** 3) - content_length = self._current_part.tell() + content_length = end = self._current_part.tell() start = self._total_size - content_length stop = self._total_size - 1 - if stop - start + 1 == content_length: - end = content_length - else: - end = _UNKNOWN_FILE_SIZE - if content_length % _REQUIRED_CHUNK_MULTIPLE != 0: - stop = content_length % _REQUIRED_CHUNK_MULTIPLE - 1 self._current_part.seek(0) From 5b8629150758aa8516266496322bfe16693691a5 Mon Sep 17 00:00:00 2001 From: Peter Date: Wed, 8 Jan 2020 18:27:57 -0500 Subject: [PATCH 67/84] Improve UploadFailedError --- smart_open/gcs.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index beca4408..540a6eeb 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -61,8 +61,23 @@ def _make_range_string(start, stop=None, end=_UNKNOWN_FILE_SIZE): return 'bytes %d-%d/%s' % (start, stop, end) -class _UploadFailedError(Exception): - """Raised when a multi-part upload to GCS returns a failed response status code.""" +class UploadFailedError(Exception): + def __init__(self, message, status_code, text): + """Raise when a multi-part upload to GCS returns a failed response status code. + + Parameters + ---------- + message: str + The error message to display. + status_code: str + The status code returned from the upload response. + text: str + The text returned from the upload response. 
+ + """ + super(UploadFailedError, self).__init__(message) + self.status_code = status_code + self.text = text def open( @@ -462,9 +477,10 @@ def _upload_next_part(self): response = self._session.put(self._resumable_upload_url, data=self._current_part, headers=headers) if response.status_code not in _SUCCESSFUL_STATUS_CODES: - logger.error("upload failed with status %s", response.status_code) - logger.error("response message: %s", response.text) - raise _UploadFailedError + status_code, text = response.status_code, response.text + msg = "upload failed with status code: %s, response text: %s, part #%i, %i bytes (total %.3fGB)" % \ + status_code, text, part_num, self._current_part.tell(), self._total_size / 1024.0 ** 3 + raise UploadFailedError(msg, status_code, text) logger.debug("upload of part #%i finished" % part_num) self._total_parts += 1 From 52cb2d10e064d73c2d96f973e6148f9b230f393c Mon Sep 17 00:00:00 2001 From: Peter Date: Wed, 8 Jan 2020 20:37:41 -0500 Subject: [PATCH 68/84] Add docstring to terminate --- smart_open/gcs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 540a6eeb..1ba2b022 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -450,6 +450,7 @@ def write(self, b): return len(b) def terminate(self): + """Cancel the underlying resumable upload.""" # # https://cloud.google.com/storage/docs/xml-api/resumable-upload#example_cancelling_an_upload # From e02fdabf6aa6b19e156a59d2e4e8b48918dd6503 Mon Sep 17 00:00:00 2001 From: Peter Date: Wed, 8 Jan 2020 20:44:41 -0500 Subject: [PATCH 69/84] Add additional clean up to close --- smart_open/gcs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 1ba2b022..d6ef7207 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -211,6 +211,8 @@ def close(self): """Flush and close this stream.""" logger.debug("close: called") self._blob = None + self._current_part = None + self._raw_reader def readable(self): 
"""Return True if the stream can be read from.""" From dcef33dc510b47f3b0939b64298bdc5d3151b05b Mon Sep 17 00:00:00 2001 From: Peter Date: Wed, 8 Jan 2020 20:48:27 -0500 Subject: [PATCH 70/84] Remove useless terminate in SeekableBufferedInputBase --- smart_open/gcs.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index d6ef7207..5f8f844c 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -327,10 +327,6 @@ def readline(self, limit=-1): self._fill_buffer() return the_line.getvalue() - def terminate(self): - """Do nothing.""" - pass - # # Internal methods. # From 8ddb98db419371119401ac52cea6a436610f7459 Mon Sep 17 00:00:00 2001 From: Peter Date: Thu, 9 Jan 2020 08:18:22 -0500 Subject: [PATCH 71/84] Fix data type for status_code in UploadFailedError --- smart_open/gcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 5f8f844c..ca38493f 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -69,7 +69,7 @@ def __init__(self, message, status_code, text): ---------- message: str The error message to display. - status_code: str + status_code: int The status code returned from the upload response. text: str The text returned from the upload response. 
From c3fc44a3a0bc2e120550b671bdb738bf2e565de9 Mon Sep 17 00:00:00 2001 From: Peter Date: Thu, 9 Jan 2020 08:22:06 -0500 Subject: [PATCH 72/84] Clean up UploadFailedError msg --- smart_open/gcs.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index ca38493f..e0e77401 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -477,8 +477,19 @@ def _upload_next_part(self): if response.status_code not in _SUCCESSFUL_STATUS_CODES: status_code, text = response.status_code, response.text - msg = "upload failed with status code: %s, response text: %s, part #%i, %i bytes (total %.3fGB)" % \ - status_code, text, part_num, self._current_part.tell(), self._total_size / 1024.0 ** 3 + msg = ( + "upload failed (" + "status code: %i, " + "response text=%s, " + "part #%i, " + "%i bytes (total %.3fGB)" + ) % ( + status_code, + text, + part_num, + self._current_part.tell(), + self._total_size / 1024.0 ** 3, + ) raise UploadFailedError(msg, status_code, text) logger.debug("upload of part #%i finished" % part_num) From 7cc0509eab7da49840b8d34fd30a7462d4af6053 Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Sun, 12 Jan 2020 08:54:43 -0500 Subject: [PATCH 73/84] Start on mock tests --- smart_open/tests/test_gcs.py | 95 +++++++++++++++++++++++++++--------- 1 file changed, 71 insertions(+), 24 deletions(-) diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index 2451a88e..3c413293 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -18,6 +18,7 @@ except ImportError: import mock import warnings +from collections import OrderedDict import google.cloud import google.api_core.exceptions @@ -50,33 +51,76 @@ class FakeBucket(object): def __init__(self, client, name=None): self.client = client # type: FakeClient self.name = name - self.blobs = [] + self._blobs = OrderedDict() self._exists = True + self.client.add_bucket(self) + def blob(self, blob_id): - blob = next((x 
for x in self.blobs if x.name == blob_id), None) - if blob is None: - blob = FakeBlob(blob_id, self) - return blob + return self._blobs.get(blob_id, FakeBlob(blob_id, self)) def delete(self): self.client._delete_bucket(self) self._exists = False + for blob in list(self._blobs.values()): + blob.delete() def exists(self): return self._exists def get_blob(self, blob_id): - blob = next((x for x in self.blobs if x.name == blob_id), None) - if blob is None: + try: + return self._blobs[blob_id] + except KeyError: raise google.cloud.exceptions.NotFound('Blob {} not found'.format(blob_id)) - return blob def list_blobs(self): - return self.blobs + return list(self._blobs.values()) def _delete_blob(self, blob): - self.blobs.remove(blob) + del self._blobs[blob.name] + + +class FakeBucketTest(unittest.TestCase): + def setUp(self): + self.client = FakeClient() + self.bucket = FakeBucket(self.client, 'test-bucket') + + def test_blob(self): + blob_id = 'blob.txt' + expected = FakeBlob(blob_id, self.bucket) + actual = self.bucket.blob(blob_id) + self.assertEqual(actual, expected) + + def test_create_blob(self): + blob_id = 'blob.txt' + expected = self.bucket.blob(blob_id) + actual = self.bucket.list_blobs()[0] + self.assertEqual(actual, expected) + + def test_delete(self): + blob_id = 'blob.txt' + blob = FakeBlob(blob_id, self.bucket) + self.bucket.delete() + self.assertFalse(self.bucket.exists()) + self.assertFalse(blob.exists()) + + def test_get_blob(self): + blob_one_id = 'blob_one.avro' + blob_two_id = 'blob_two.parquet' + blob_one = self.bucket.blob(blob_one_id) + blob_two = self.bucket.blob(blob_two_id) + actual_first_blob = self.bucket.get_blob(blob_one_id) + actual_second_blob = self.bucket.get_blob(blob_two_id) + self.assertEqual(actual_first_blob, blob_one) + self.assertEqual(actual_second_blob, blob_two) + + def test_list_blobs(self): + blob_one = self.bucket.blob('blob_one.avro') + blob_two = self.bucket.blob('blob_two.parquet') + actual = self.bucket.list_blobs() + 
expected = [blob_one, blob_two] + self.assertEqual(actual, expected) class FakeBlob(object): @@ -99,7 +143,7 @@ def create_resumable_upload_session(self): def delete(self): self._bucket._delete_blob(self) - self._created = False + self._exists = False def download_as_string(self, start=None, end=None): if start is None: @@ -112,11 +156,11 @@ def download_as_string(self, start=None, end=None): def exists(self, client=None): return self._exists - def upload_from_string(self, str): - self.__contents.write(str) + def upload_from_string(self, str_): + self.__contents.write(str_) def write(self, data): - self.__contents.write(data) + self.upload_from_string(data) @property def bucket(self): @@ -129,8 +173,8 @@ def size(self): return self.__contents.tell() def _create_if_not_exists(self): - if self not in self._bucket.blobs: - self._bucket.blobs.append(self) + if self.name not in self._bucket._blobs.keys(): + self._bucket._blobs[self.name] = self self._exists = True @@ -147,25 +191,28 @@ def __init__(self, credentials=None): if credentials is None: credentials = FakeCredentials(self) self._credentials = credentials # type: FakeCredentials - self._uploads = {} - self.__buckets = [] + self._uploads = OrderedDict() + self.__buckets = OrderedDict() def bucket(self, bucket_id): - bucket = next((x for x in self.__buckets if x.name == bucket_id), None) - if bucket is None: + try: + return self.__buckets[bucket_id] + except KeyError: raise google.cloud.exceptions.NotFound('Bucket {} not found'.format(bucket_id)) - return bucket def create_bucket(self, bucket_id): bucket = FakeBucket(self, bucket_id) - self.__buckets.append(bucket) + self.__buckets[bucket_id] = bucket return bucket def get_bucket(self, bucket_id): return self.bucket(bucket_id) + def add_bucket(self, bucket): + self.__buckets[bucket.name] = bucket + def _delete_bucket(self, bucket): - self.__buckets.remove(bucket) + del self.__buckets[bucket.name] class FakeBlobUpload(object): @@ -194,7 +241,7 @@ def put(self, url, 
data=None, headers=None): return FakeResponse() def _blob_with_url(self, url, client): - # type: (str, FakeClient) -> None + # type: (str, FakeClient) -> FakeBlobUpload return client._uploads.get(url) From bec8a6d57dc6546d0c631cea80bccfd60282e3d8 Mon Sep 17 00:00:00 2001 From: Peter Date: Mon, 20 Jan 2020 09:33:27 -0500 Subject: [PATCH 74/84] Add tests for mocks --- smart_open/gcs.py | 4 +- smart_open/tests/test_gcs.py | 177 +++++++++++++++++++++++++++++------ 2 files changed, 152 insertions(+), 29 deletions(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index e0e77401..6f8477d8 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -183,7 +183,7 @@ def __init__( line_terminator=_BINARY_NEWLINE, client=None, # type: google.cloud.storage.Client ): - if not client: + if client is None: client = google.cloud.storage.Client() bucket = client.get_bucket(bucket) # type: google.cloud.storage.Bucket @@ -379,7 +379,7 @@ def __init__( if client is None: client = google.cloud.storage.Client() self._client = client - self._credentials = self._client._credentials + self._credentials = self._client._credentials # noqa self._bucket = self._client.bucket(bucket) # type: google.cloud.storage.Bucket self._blob = self._bucket.blob(blob) # type: google.cloud.storage.Blob assert min_part_size % _REQUIRED_CHUNK_MULTIPLE == 0, 'min part size must be a multiple of 256KB' diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index 3c413293..9f2c2e9f 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -33,9 +33,9 @@ RESUMABLE_SESSION_URI_TEMPLATE = ( 'https://www.googleapis.com/upload/storage/v1/b/' - '{bucket}' + '%(bucket)s' '/o?uploadType=resumable&upload_id=' - '{upload_id}' + '%(upload_id)s' ) logger = logging.getLogger(__name__) @@ -51,18 +51,21 @@ class FakeBucket(object): def __init__(self, client, name=None): self.client = client # type: FakeClient self.name = name - self._blobs = OrderedDict() + self.blobs = OrderedDict() 
self._exists = True - self.client.add_bucket(self) + # + # This is simpler than creating a backend and metaclass to store the state of every bucket created + # + self.client.register_bucket(self) def blob(self, blob_id): - return self._blobs.get(blob_id, FakeBlob(blob_id, self)) + return self.blobs.get(blob_id, FakeBlob(blob_id, self)) def delete(self): - self.client._delete_bucket(self) + self.client.delete_bucket(self) self._exists = False - for blob in list(self._blobs.values()): + for blob in list(self.blobs.values()): blob.delete() def exists(self): @@ -70,15 +73,15 @@ def exists(self): def get_blob(self, blob_id): try: - return self._blobs[blob_id] + return self.blobs[blob_id] except KeyError: raise google.cloud.exceptions.NotFound('Blob {} not found'.format(blob_id)) def list_blobs(self): - return list(self._blobs.values()) + return list(self.blobs.values()) - def _delete_blob(self, blob): - del self._blobs[blob.name] + def delete_blob(self, blob): + del self.blobs[blob.name] class FakeBucketTest(unittest.TestCase): @@ -115,6 +118,10 @@ def test_get_blob(self): self.assertEqual(actual_first_blob, blob_one) self.assertEqual(actual_second_blob, blob_two) + def test_get_nonexistent_blob(self): + with self.assertRaises(google.cloud.exceptions.NotFound): + self.bucket.get_blob('test-blob ') + def test_list_blobs(self): blob_one = self.bucket.blob('blob_one.avro') blob_two = self.bucket.blob('blob_two.parquet') @@ -136,13 +143,13 @@ def __init__(self, name, bucket, create=True): def create_resumable_upload_session(self): resumeable_upload_url = RESUMABLE_SESSION_URI_TEMPLATE % dict( bucket=self._bucket.name, - upload_id=uuid.uuid4().hex, + upload_id=str(uuid.uuid4()), ) - self._bucket.client._uploads[resumeable_upload_url] = self + self._bucket.client.uploads[resumeable_upload_url] = FakeBlobUpload(resumeable_upload_url, self) return resumeable_upload_url def delete(self): - self._bucket._delete_blob(self) + self._bucket.delete_blob(self) self._exists = False def 
download_as_string(self, start=None, end=None): @@ -173,11 +180,43 @@ def size(self): return self.__contents.tell() def _create_if_not_exists(self): - if self.name not in self._bucket._blobs.keys(): - self._bucket._blobs[self.name] = self + if self.name not in self._bucket.blobs.keys(): + self._bucket.blobs[self.name] = self self._exists = True +class FakeBlobTest(unittest.TestCase): + def setUp(self): + self.client = FakeClient() + self.bucket = FakeBucket(self.client, 'test-bucket') + + def test_create_resumable_upload_session(self): + blob = FakeBlob('fake-blob', self.bucket) + resumable_upload_url = blob.create_resumable_upload_session() + self.assertTrue(resumable_upload_url in self.client.uploads) + + def test_delete(self): + blob = FakeBlob('fake-blob', self.bucket) + blob.delete() + self.assertFalse(blob.exists()) + self.assertEqual(self.bucket.list_blobs(), []) + + def test_upload_download(self): + blob = FakeBlob('fake-blob', self.bucket) + contents = b'test' + blob.upload_from_string(contents) + self.assertEqual(blob.download_as_string(), b'test') + self.assertEqual(blob.download_as_string(start=2), b'st') + self.assertEqual(blob.download_as_string(end=2), b'te') + self.assertEqual(blob.download_as_string(start=2, end=3), b's') + + def test_size(self): + blob = FakeBlob('fake-blob', self.bucket) + self.assertEqual(blob.size, None) + blob.upload_from_string(b'test') + self.assertEqual(blob.size, 4) + + class FakeCredentials(object): def __init__(self, client): self.client = client # type: FakeClient @@ -191,34 +230,75 @@ def __init__(self, credentials=None): if credentials is None: credentials = FakeCredentials(self) self._credentials = credentials # type: FakeCredentials - self._uploads = OrderedDict() + self.uploads = OrderedDict() self.__buckets = OrderedDict() def bucket(self, bucket_id): try: return self.__buckets[bucket_id] except KeyError: - raise google.cloud.exceptions.NotFound('Bucket {} not found'.format(bucket_id)) + raise 
google.cloud.exceptions.NotFound('Bucket %s not found' % bucket_id) def create_bucket(self, bucket_id): bucket = FakeBucket(self, bucket_id) - self.__buckets[bucket_id] = bucket return bucket def get_bucket(self, bucket_id): return self.bucket(bucket_id) - def add_bucket(self, bucket): + def register_bucket(self, bucket): + if bucket.name in self.__buckets: + raise google.cloud.exceptions.Conflict('Bucket %s already exists' % bucket.name) self.__buckets[bucket.name] = bucket - def _delete_bucket(self, bucket): + def delete_bucket(self, bucket): del self.__buckets[bucket.name] +class FakeClientTest(unittest.TestCase): + def setUp(self): + self.client = FakeClient() + + def test_nonexistent_bucket(self): + with self.assertRaises(google.cloud.exceptions.NotFound): + self.client.bucket('test-bucket') + + def test_bucket(self): + bucket_id = 'test-bucket' + bucket = FakeBucket(self.client, bucket_id) + actual = self.client.bucket(bucket_id) + self.assertEqual(actual, bucket) + + def test_duplicate_bucket(self): + bucket_id = 'test-bucket' + FakeBucket(self.client, bucket_id) + with self.assertRaises(google.cloud.exceptions.Conflict): + FakeBucket(self.client, bucket_id) + + def test_create_bucket(self): + bucket_id = 'test-bucket' + bucket = self.client.create_bucket(bucket_id) + actual = self.client.get_bucket(bucket_id) + self.assertEqual(actual, bucket) + + class FakeBlobUpload(object): def __init__(self, url, blob): self.url = url self.blob = blob # type: FakeBlob + self.__contents = io.BytesIO() + + def write(self, data): + self.__contents.write(data) + + def finish(self): + self.__contents.seek(0) + data = self.__contents.read() + self.blob.upload_from_string(data) + + def terminate(self): + self.blob.delete() + self.__contents = None class FakeResponse(object): @@ -231,18 +311,61 @@ def __init__(self, credentials): self._credentials = credentials # type: FakeCredentials def delete(self, upload_url): - blob = self._credentials.client._uploads[upload_url] - 
blob.delete() - self._credentials.client._uploads.pop(upload_url) + upload = self._credentials.client.uploads.pop(upload_url) + upload.terminate() def put(self, url, data=None, headers=None): if data is not None: - self._credentials.client._uploads[url].write(data.read()) + upload = self._credentials.client.uploads[url] + upload.write(data.read()) + if not headers['Content-Range'].endswith(smart_open.gcs._UNKNOWN_FILE_SIZE): + upload.finish() return FakeResponse() - def _blob_with_url(self, url, client): + @staticmethod + def _blob_with_url(url, client): # type: (str, FakeClient) -> FakeBlobUpload - return client._uploads.get(url) + return client.uploads.get(url) + + +class FakeAuthorizedSessionTest(unittest.TestCase): + def setUp(self): + self.client = FakeClient() + self.credentials = FakeCredentials(self.client) + self.session = FakeAuthorizedSession(self.credentials) + self.bucket = FakeBucket(self.client, 'test-bucket') + self.blob = FakeBlob('test-blob', self.bucket) + self.upload_url = self.blob.create_resumable_upload_session() + + def test_delete(self): + self.session.delete(self.upload_url) + self.assertFalse(self.blob.exists()) + self.assertDictEqual(self.client.uploads, {}) + + def test_unfinished_put_does_not_write_to_blob(self): + data = io.BytesIO(b'test') + headers = { + 'Content-Range': 'bytes 0-3/*', + 'Content-Length': str(4), + } + response = self.session.put(self.upload_url, data, headers=headers) + self.assertEqual(response.status_code, 200) + self.session._blob_with_url(self.upload_url, self.client) + blob_contents = self.blob.download_as_string() + self.assertEqual(blob_contents, b'') + + def test_finished_put_writes_to_blob(self): + data = io.BytesIO(b'test') + headers = { + 'Content-Range': 'bytes 0-3/4', + 'Content-Length': str(4), + } + response = self.session.put(self.upload_url, data, headers=headers) + self.assertEqual(response.status_code, 200) + self.session._blob_with_url(self.upload_url, self.client) + blob_contents = 
self.blob.download_as_string() + data.seek(0) + self.assertEqual(blob_contents, data.read()) if DISABLE_MOCKS: From be21104111de8827f534a62c1930b93d03a0ebe1 Mon Sep 17 00:00:00 2001 From: Peter Date: Mon, 20 Jan 2020 09:56:30 -0500 Subject: [PATCH 75/84] Clean up registering dependencies --- smart_open/tests/test_gcs.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index 9f2c2e9f..bbf12fed 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -83,19 +83,26 @@ def list_blobs(self): def delete_blob(self, blob): del self.blobs[blob.name] + def register_blob(self, blob): + if blob.name not in self.blobs.keys(): + self.blobs[blob.name] = blob + + def register_upload(self, upload): + self.client.register_upload(upload) + class FakeBucketTest(unittest.TestCase): def setUp(self): self.client = FakeClient() self.bucket = FakeBucket(self.client, 'test-bucket') - def test_blob(self): + def test_blob_registers_with_bucket(self): blob_id = 'blob.txt' expected = FakeBlob(blob_id, self.bucket) actual = self.bucket.blob(blob_id) self.assertEqual(actual, expected) - def test_create_blob(self): + def test_blob_alternate_constuctor(self): blob_id = 'blob.txt' expected = self.bucket.blob(blob_id) actual = self.bucket.list_blobs()[0] @@ -108,7 +115,7 @@ def test_delete(self): self.assertFalse(self.bucket.exists()) self.assertFalse(blob.exists()) - def test_get_blob(self): + def test_get_multiple_blobs(self): blob_one_id = 'blob_one.avro' blob_two_id = 'blob_two.parquet' blob_one = self.bucket.blob(blob_one_id) @@ -120,7 +127,7 @@ def test_get_blob(self): def test_get_nonexistent_blob(self): with self.assertRaises(google.cloud.exceptions.NotFound): - self.bucket.get_blob('test-blob ') + self.bucket.get_blob('test-blob') def test_list_blobs(self): blob_one = self.bucket.blob('blob_one.avro') @@ -145,7 +152,8 @@ def create_resumable_upload_session(self): 
bucket=self._bucket.name, upload_id=str(uuid.uuid4()), ) - self._bucket.client.uploads[resumeable_upload_url] = FakeBlobUpload(resumeable_upload_url, self) + upload = FakeBlobUpload(resumeable_upload_url, self) + self._bucket.register_upload(upload) return resumeable_upload_url def delete(self): @@ -180,8 +188,7 @@ def size(self): return self.__contents.tell() def _create_if_not_exists(self): - if self.name not in self._bucket.blobs.keys(): - self._bucket.blobs[self.name] = self + self._bucket.register_blob(self) self._exists = True @@ -254,6 +261,9 @@ def register_bucket(self, bucket): def delete_bucket(self, bucket): del self.__buckets[bucket.name] + def register_upload(self, upload): + self.uploads[upload.url] = upload + class FakeClientTest(unittest.TestCase): def setUp(self): From a760f7115c9f6fa1e1ab55e6c03ccba195fc4e2b Mon Sep 17 00:00:00 2001 From: Peter Date: Mon, 20 Jan 2020 10:29:05 -0500 Subject: [PATCH 76/84] Get initialize_bucket to work without gsutil --- integration-tests/test_gcs.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/integration-tests/test_gcs.py b/integration-tests/test_gcs.py index a8a49988..c5e2e653 100644 --- a/integration-tests/test_gcs.py +++ b/integration-tests/test_gcs.py @@ -1,7 +1,9 @@ # -*- coding: utf-8 -*- import io import os -import subprocess + +import google.cloud.storage +from six.moves.urllib import parse as urlparse import smart_open @@ -10,7 +12,14 @@ def initialize_bucket(): - subprocess.check_call(['gsutil' 'rm', '-r', _GCS_URL]) + client = google.cloud.storage.Client() + parsed = urlparse.urlparse(_GCS_URL) + bucket_name = parsed.netloc + prefix = parsed.path + bucket = client.get_bucket(bucket_name) + blobs = bucket.list_blobs(prefix=prefix) + for blob in blobs: + blob.delete() def write_read(key, content, write_mode, read_mode, **kwargs): From dadda4922c4a2b5017f90a9ae295eaac7877e3e6 Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Thu, 23 Jan 2020 11:31:03 -0500 Subject: 
[PATCH 77/84] Change buffering to buffer_size --- smart_open/gcs.py | 12 ++++++------ smart_open/tests/test_gcs.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 6f8477d8..b042d47b 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -84,7 +84,7 @@ def open( bucket_id, blob_id, mode, - buffering=DEFAULT_BUFFER_SIZE, + buffer_size=DEFAULT_BUFFER_SIZE, min_part_size=_MIN_MIN_PART_SIZE, client=None, # type: google.cloud.storage.Client ): @@ -98,7 +98,7 @@ def open( The name of the blob within the bucket. mode: str The mode for opening the object. Must be either "rb" or "wb". - buffering: int, optional + buffer_size: int, optional The buffer size to use when performing I/O. For reading only. min_part_size: int, optional The minimum part size for multipart uploads. For writing only. @@ -110,7 +110,7 @@ def open( return SeekableBufferedInputBase( bucket_id, blob_id, - buffering=buffering, + buffer_size=buffer_size, line_terminator=_BINARY_NEWLINE, client=client, ) @@ -179,7 +179,7 @@ def __init__( self, bucket, key, - buffering=DEFAULT_BUFFER_SIZE, + buffer_size=DEFAULT_BUFFER_SIZE, line_terminator=_BINARY_NEWLINE, client=None, # type: google.cloud.storage.Client ): @@ -194,8 +194,8 @@ def __init__( self._raw_reader = _SeekableRawReader(self._blob, self._size) self._current_pos = 0 - self._current_part_size = buffering - self._current_part = smart_open.bytebuffer.ByteBuffer(buffering) + self._current_part_size = buffer_size + self._current_part = smart_open.bytebuffer.ByteBuffer(buffer_size) self._eof = False self._line_terminator = line_terminator diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index bbf12fed..54bd94a8 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -625,7 +625,7 @@ def test_readline_tiny_buffer(self): content = b'englishman\nin\nnew\nyork\n' put_to_bucket(contents=content) - with 
smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME, buffering=8) as fin: + with smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME, buffer_size=8) as fin: actual = list(fin) expected = [b'englishman\n', b'in\n', b'new\n', b'york\n'] From 343d5a2d5bb9f438cef0651cef7d8391ca19a422 Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Thu, 23 Jan 2020 11:35:19 -0500 Subject: [PATCH 78/84] Add copyright header, fix logging styles, and move result outside ctx manager --- integration-tests/test_gcs.py | 3 ++- smart_open/gcs.py | 23 +++++++++++++++-------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/integration-tests/test_gcs.py b/integration-tests/test_gcs.py index c5e2e653..ebf1f0e3 100644 --- a/integration-tests/test_gcs.py +++ b/integration-tests/test_gcs.py @@ -30,8 +30,9 @@ def write_read(key, content, write_mode, read_mode, **kwargs): def read_length_prefixed_messages(key, read_mode, **kwargs): + result = io.BytesIO() + with smart_open.open(key, read_mode, **kwargs) as fin: - result = io.BytesIO() length_byte = fin.read(1) while len(length_byte): result.write(length_byte) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index b042d47b..50781025 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -1,4 +1,10 @@ # -*- coding: utf-8 -*- +# +# Copyright (C) 2019 Radim Rehurek +# +# This code is distributed under the terms and conditions +# from the MIT License (MIT). 
+# """Implements file-like objects for reading and writing to/from GCS.""" import io @@ -459,10 +465,12 @@ def terminate(self): # def _upload_next_part(self): part_num = self._total_parts + 1 - logger.info("uploading part #%i, %i bytes (total %.3fGB)", - part_num, - self._current_part.tell(), - self._total_size / 1024.0 ** 3) + logger.info( + "uploading part #%i, %i bytes (total %.3fGB)", + part_num, + self._current_part.tell(), + self._total_size / 1024.0 ** 3 + ) content_length = end = self._current_part.tell() start = self._total_size - content_length stop = self._total_size - 1 @@ -476,7 +484,6 @@ def _upload_next_part(self): response = self._session.put(self._resumable_upload_url, data=self._current_part, headers=headers) if response.status_code not in _SUCCESSFUL_STATUS_CODES: - status_code, text = response.status_code, response.text msg = ( "upload failed (" "status code: %i" @@ -484,13 +491,13 @@ def _upload_next_part(self): "part #%i, " "%i bytes (total %.3fGB)" ) % ( - status_code, - text, + response.status_code, + response.text, part_num, self._current_part.tell(), self._total_size / 1024.0 ** 3, ) - raise UploadFailedError(msg, status_code, text) + raise UploadFailedError(msg, response.status_code, response.text) logger.debug("upload of part #%i finished" % part_num) self._total_parts += 1 From 8373740ab1e7968ca917d1f5fbe8cefcebe15c25 Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Thu, 23 Jan 2020 11:35:55 -0500 Subject: [PATCH 79/84] Change _upload_empty_part to debug msg --- smart_open/gcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 50781025..6841ff5a 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -504,7 +504,7 @@ def _upload_next_part(self): self._current_part = io.BytesIO() def _upload_empty_part(self): - logger.info("creating empty file") + logger.debug("creating empty file") headers = {'Content-Length': '0'} response = 
self._session.put(self._resumable_upload_url, headers=headers) assert response.status_code in _SUCCESSFUL_STATUS_CODES From ab4d936db037963ff0a75ce1ffdd4b32b77662f8 Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Thu, 23 Jan 2020 11:37:38 -0500 Subject: [PATCH 80/84] Clean up patching style --- smart_open/tests/test_gcs.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index 54bd94a8..df8c4c06 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -435,13 +435,10 @@ def mock_gcs(class_or_func): def mock_gcs_func(func): """Mock the function and provide additional required arguments.""" def inner(*args, **kwargs): - with mock.patch( - 'google.cloud.storage.Client', - return_value=storage_client, - ), \ + with mock.patch('google.cloud.storage.Client', return_value=storage_client), \ mock.patch( - 'smart_open.gcs.google_requests.AuthorizedSession', - return_value=FakeAuthorizedSession(storage_client._credentials), + 'smart_open.gcs.google_requests.AuthorizedSession', + return_value=FakeAuthorizedSession(storage_client._credentials), ): assert callable(func), 'you didn\'t provide a function!' try: # is it a method that needs a self arg? 
From f8dd13b0ce953830ecb3f4b231167dba148b9b8f Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Thu, 23 Jan 2020 11:43:32 -0500 Subject: [PATCH 81/84] Add tests for smart_open_lib --- smart_open/tests/test_smart_open.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/smart_open/tests/test_smart_open.py b/smart_open/tests/test_smart_open.py index 2ea0a359..014e56c0 100644 --- a/smart_open/tests/test_smart_open.py +++ b/smart_open/tests/test_smart_open.py @@ -40,7 +40,7 @@ class ParseUriTest(unittest.TestCase): def test_scheme(self): """Do URIs schemes parse correctly?""" # supported schemes - for scheme in ("s3", "s3a", "s3n", "hdfs", "file", "http", "https"): + for scheme in ("s3", "s3a", "s3n", "hdfs", "file", "http", "https", "gs"): parsed_uri = smart_open_lib._parse_uri(scheme + "://mybucket/mykey") self.assertEqual(parsed_uri.scheme, scheme) @@ -273,6 +273,24 @@ def test_ssh_complex_password_with_colon(self): uri = smart_open_lib._parse_uri(as_string) self.assertEqual(uri.password, 'some:complex@password$$') + def test_gs_uri(self): + """Do GCS URIs parse correctly?""" + # correct uri without credentials + parsed_uri = smart_open_lib._parse_uri("gs://mybucket/myblob") + self.assertEqual(parsed_uri.scheme, "gs") + self.assertEqual(parsed_uri.bucket_id, "mybucket") + self.assertEqual(parsed_uri.blob_id, "myblob") + self.assertEqual(parsed_uri.access_id, None) + self.assertEqual(parsed_uri.access_secret, None) + + def test_gs_uri_contains_slash(self): + parsed_uri = smart_open_lib._parse_uri("gs://mybucket/mydir/myblob") + self.assertEqual(parsed_uri.scheme, "gs") + self.assertEqual(parsed_uri.bucket_id, "mybucket") + self.assertEqual(parsed_uri.blob_id, "mydir/myblob") + self.assertEqual(parsed_uri.access_id, None) + self.assertEqual(parsed_uri.access_secret, None) + class SmartOpenHttpTest(unittest.TestCase): """ From 1d2613c3c308e76644ae87cf3a523dc78fa9d5c9 Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Thu, 23 
Jan 2020 11:45:26 -0500 Subject: [PATCH 82/84] Add blank lines before constants to help readability --- smart_open/gcs.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 6841ff5a..1973f2f9 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -23,11 +23,13 @@ _READ_BINARY = 'rb' _WRITE_BINARY = 'wb' + _MODES = (_READ_BINARY, _WRITE_BINARY) """Allowed I/O modes for working with GCS.""" _BINARY_TYPES = (six.binary_type, bytearray) """Allowed binary buffer types for writing to the underlying GCS stream""" + if sys.version_info >= (2, 7): _BINARY_TYPES = (six.binary_type, bytearray, memoryview) @@ -49,10 +51,13 @@ START = 0 """Seek to the absolute start of a GCS file""" + CURRENT = 1 """Seek relative to the current position of a GCS file""" + END = 2 """Seek relative to the end of a GCS file""" + _WHENCE_CHOICES = (START, CURRENT, END) _SUCCESSFUL_STATUS_CODES = (200, 201) From 9ec0b401bb48888f897b9e4f517611850012b98c Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Thu, 23 Jan 2020 11:47:52 -0500 Subject: [PATCH 83/84] Add missing clean up of raw_reader in close --- smart_open/gcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 1973f2f9..dd33ae39 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -223,7 +223,7 @@ def close(self): logger.debug("close: called") self._blob = None self._current_part = None - self._raw_reader + self._raw_reader = None def readable(self): """Return True if the stream can be read from.""" From a8c272e60973854379d4c0e999630f5de10ff5c8 Mon Sep 17 00:00:00 2001 From: Peter Dannemann Date: Thu, 23 Jan 2020 12:04:33 -0500 Subject: [PATCH 84/84] Remove aws related credentials from gs tests --- smart_open/tests/test_smart_open.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/smart_open/tests/test_smart_open.py b/smart_open/tests/test_smart_open.py index 014e56c0..d16ac4e3 100644 ---
a/smart_open/tests/test_smart_open.py +++ b/smart_open/tests/test_smart_open.py @@ -280,16 +280,12 @@ def test_gs_uri(self): self.assertEqual(parsed_uri.scheme, "gs") self.assertEqual(parsed_uri.bucket_id, "mybucket") self.assertEqual(parsed_uri.blob_id, "myblob") - self.assertEqual(parsed_uri.access_id, None) - self.assertEqual(parsed_uri.access_secret, None) def test_gs_uri_contains_slash(self): parsed_uri = smart_open_lib._parse_uri("gs://mybucket/mydir/myblob") self.assertEqual(parsed_uri.scheme, "gs") self.assertEqual(parsed_uri.bucket_id, "mybucket") self.assertEqual(parsed_uri.blob_id, "mydir/myblob") - self.assertEqual(parsed_uri.access_id, None) - self.assertEqual(parsed_uri.access_secret, None) class SmartOpenHttpTest(unittest.TestCase):