From 68562731be890bb508966e346b9a2bd973de3f11 Mon Sep 17 00:00:00 2001
From: Tres Seaver
Date: Fri, 6 May 2016 11:48:28 -0400
Subject: [PATCH 1/6] Ensure that multipart bodies are always bytes.

Fixes #1760.
---
 gcloud/bigquery/test_table.py     | 11 ++++++-----
 gcloud/streaming/test_transfer.py | 24 ++++++++++++++++++------
 gcloud/streaming/transfer.py      | 24 +++++++++++++-----------
 3 files changed, 37 insertions(+), 22 deletions(-)

diff --git a/gcloud/bigquery/test_table.py b/gcloud/bigquery/test_table.py
index 0f2ca4ea9e80..9c4b235bcbc8 100644
--- a/gcloud/bigquery/test_table.py
+++ b/gcloud/bigquery/test_table.py
@@ -1351,12 +1351,14 @@ def _upload_from_file_helper(self, **kw):
         return conn.http._requested, PATH, BODY
 
     def test_upload_from_file_w_bound_client_multipart(self):
-        from email.parser import Parser
         import json
         from six.moves.urllib.parse import parse_qsl
         from six.moves.urllib.parse import urlsplit
+        from gcloud._helpers import _to_bytes
+        from gcloud.streaming.test_transfer import _email_chunk_parser
 
         requested, PATH, BODY = self._upload_from_file_helper()
+        parse_chunk = _email_chunk_parser()
         self.assertEqual(len(requested), 1)
 
         req = requested[0]
@@ -1369,18 +1371,17 @@ def test_upload_from_file_w_bound_client_multipart(self):
         self.assertEqual(dict(parse_qsl(qs)),
                          {'uploadType': 'multipart'})
 
-        parser = Parser()
         ctype, boundary = [x.strip()
                            for x in req['headers']['content-type'].split(';')]
         self.assertEqual(ctype, 'multipart/related')
         self.assertTrue(boundary.startswith('boundary="=='))
         self.assertTrue(boundary.endswith('=="'))
 
-        divider = '--' + boundary[len('boundary="'):-1]
+        divider = b'--' + _to_bytes(boundary[len('boundary="'):-1])
         chunks = req['body'].split(divider)[1:-1]  # discard prolog / epilog
         self.assertEqual(len(chunks), 2)
 
-        text_msg = parser.parsestr(chunks[0].strip())
+        text_msg = parse_chunk(chunks[0].strip())
         self.assertEqual(dict(text_msg._headers),
                          {'Content-Type': 'application/json',
                           'MIME-Version': '1.0'})
@@ -1394,7 +1395,7 @@ def test_upload_from_file_w_bound_client_multipart(self):
         self.assertEqual(load_config['destinationTable'], DESTINATION_TABLE)
         self.assertEqual(load_config['sourceFormat'], 'CSV')
 
-        app_msg = parser.parsestr(chunks[1].strip())
+        app_msg = parse_chunk(chunks[1].strip())
         self.assertEqual(dict(app_msg._headers),
                          {'Content-Type': 'application/octet-stream',
                           'Content-Transfer-Encoding': 'binary',
diff --git a/gcloud/streaming/test_transfer.py b/gcloud/streaming/test_transfer.py
index 5bd7eab19411..be1dc4077c96 100644
--- a/gcloud/streaming/test_transfer.py
+++ b/gcloud/streaming/test_transfer.py
@@ -1029,7 +1029,7 @@ def test_configure_request_w_simple_wo_body(self):
         self.assertEqual(request.loggable_body, '')
 
     def test_configure_request_w_simple_w_body(self):
-        from email.parser import Parser
+        from gcloud._helpers import _to_bytes
         from gcloud.streaming.transfer import SIMPLE_UPLOAD
         CONTENT = b'CONTENT'
         BODY = b'BODY'
@@ -1045,7 +1045,6 @@ def test_configure_request_w_simple_w_body(self):
         self.assertEqual(url_builder.query_params,
                          {'uploadType': 'multipart'})
         self.assertEqual(url_builder.relative_path, config.simple_path)
-        parser = Parser()
         self.assertEqual(list(request.headers), ['content-type'])
         ctype, boundary = [x.strip()
                            for x in request.headers['content-type'].split(';')]
@@ -1053,23 +1052,24 @@ def test_configure_request_w_simple_w_body(self):
         self.assertTrue(boundary.startswith('boundary="=='))
         self.assertTrue(boundary.endswith('=="'))
 
-        divider = '--' + boundary[len('boundary="'):-1]
+        divider = b'--' + _to_bytes(boundary[len('boundary="'):-1])
         chunks = request.body.split(divider)[1:-1]  # discard prolog / epilog
         self.assertEqual(len(chunks), 2)
 
-        text_msg = parser.parsestr(chunks[0].strip())
+        parse_chunk = _email_chunk_parser()
+        text_msg = parse_chunk(chunks[0].strip())
         self.assertEqual(dict(text_msg._headers),
                          {'Content-Type': 'text/plain',
                           'MIME-Version': '1.0'})
         self.assertEqual(text_msg._payload, BODY.decode('ascii'))
 
-        app_msg = parser.parsestr(chunks[1].strip())
+        app_msg = parse_chunk(chunks[1].strip())
         self.assertEqual(dict(app_msg._headers),
                          {'Content-Type': self.MIME_TYPE,
                           'Content-Transfer-Encoding': 'binary',
                           'MIME-Version': '1.0'})
         self.assertEqual(app_msg._payload, CONTENT.decode('ascii'))
-        self.assertTrue('' in request.loggable_body)
+        self.assertTrue(b'' in request.loggable_body)
 
     def test_configure_request_w_resumable_wo_total_size(self):
         from gcloud.streaming.transfer import RESUMABLE_UPLOAD
@@ -1784,6 +1784,18 @@ def test__send_chunk_w_total_size_stream_exhausted(self):
         self.assertEqual(end, SIZE)
 
 
+def _email_chunk_parser():
+    import six
+    if six.PY3:  # pragma: NO COVER Python3
+        from email.parser import BytesParser  # pylint: disable=E0611
+        parser = BytesParser()
+        return parser.parsebytes
+    else:
+        from email.parser import Parser
+        parser = Parser()
+        return parser.parsestr
+
+
 class _Dummy(object):
     def __init__(self, **kw):
         self.__dict__.update(kw)
diff --git a/gcloud/streaming/transfer.py b/gcloud/streaming/transfer.py
index 48b8bad53488..6a546639bebd 100644
--- a/gcloud/streaming/transfer.py
+++ b/gcloud/streaming/transfer.py
@@ -9,6 +9,7 @@
 import six
 from six.moves import http_client
 
+from gcloud._helpers import _to_bytes
 from gcloud.streaming.buffered_stream import BufferedStream
 from gcloud.streaming.exceptions import CommunicationError
 from gcloud.streaming.exceptions import HttpError
@@ -849,13 +850,13 @@ def _configure_multipart_request(self, http_request):
         msg.set_payload(self.stream.read())
         msg_root.attach(msg)
 
-        # NOTE: We encode the body, but can't use
-        #       `email.message.Message.as_string` because it prepends
-        #       `> ` to `From ` lines.
-        # NOTE: We must use six.StringIO() instead of io.StringIO() since the
-        #       `email` library uses cStringIO in Py2 and io.StringIO in Py3.
-        stream = six.StringIO()
-        generator = email_generator.Generator(stream, mangle_from_=False)
+        # NOTE: generate multipart message as bytes, not text
+        stream = six.BytesIO()
+        if six.PY3:  # pragma: NO COVER Pythone
+            generator_class = email_generator.BytesGenerator
+        else:
+            generator_class = email_generator.Generator
+        generator = generator_class(stream, mangle_from_=False)
         generator.flatten(msg_root, unixfrom=False)
         http_request.body = stream.getvalue()
 
@@ -863,10 +864,11 @@
         http_request.headers['content-type'] = (
             'multipart/related; boundary="%s"' % multipart_boundary)
 
-        body_components = http_request.body.split(multipart_boundary)
-        headers, _, _ = body_components[-2].partition('\n\n')
-        body_components[-2] = '\n\n'.join([headers, '\n\n--'])
-        http_request.loggable_body = multipart_boundary.join(body_components)
+        boundary_bytes = _to_bytes(multipart_boundary)
+        body_components = http_request.body.split(boundary_bytes)
+        headers, _, _ = body_components[-2].partition(b'\n\n')
+        body_components[-2] = b'\n\n'.join([headers, b'\n\n--'])
+        http_request.loggable_body = boundary_bytes.join(body_components)
 
     def _configure_resumable_request(self, http_request):
         """Helper for 'configure_request': set up resumable request."""

From 5cb6e09f9f886753c385e099e05680b9da205dd8 Mon Sep 17 00:00:00 2001
From: Tres Seaver
Date: Fri, 6 May 2016 14:28:16 -0400
Subject: [PATCH 2/6] Note that the file to be uploaded must be opened in binary mode.

---
 gcloud/bigquery/table.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcloud/bigquery/table.py b/gcloud/bigquery/table.py
index 69597f5a3279..93863ed28547 100644
--- a/gcloud/bigquery/table.py
+++ b/gcloud/bigquery/table.py
@@ -742,7 +742,7 @@ def upload_from_file(self,
         - ``text/csv``.
 
         :type file_obj: file
-        :param file_obj: A file handle open for reading.
+        :param file_obj: A file handle opened in binary mode for reading.
 
         :type source_format: str
         :param source_format: one of 'CSV' or 'NEWLINE_DELIMITED_JSON'.

From 9ef35487256f48222f13674cb2535a7217caa95b Mon Sep 17 00:00:00 2001
From: Tres Seaver
Date: Fri, 6 May 2016 14:30:10 -0400
Subject: [PATCH 3/6] Use pylint checker name rather than code.

Addresses: https://github.com/GoogleCloudPlatform/gcloud-python/pull/1779#discussion_r62353060
---
 gcloud/streaming/test_transfer.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gcloud/streaming/test_transfer.py b/gcloud/streaming/test_transfer.py
index be1dc4077c96..0bd7bc180645 100644
--- a/gcloud/streaming/test_transfer.py
+++ b/gcloud/streaming/test_transfer.py
@@ -1787,7 +1787,9 @@ def test__send_chunk_w_total_size_stream_exhausted(self):
 def _email_chunk_parser():
     import six
     if six.PY3:  # pragma: NO COVER Python3
-        from email.parser import BytesParser  # pylint: disable=E0611
+        # pylint: disable=no-name-in-module
+        from email.parser import BytesParser
+        # pylint: enable=no-name-in-module
         parser = BytesParser()
         return parser.parsebytes
     else:

From f798af5e8496ea68b7556f0997faaff0d372d519 Mon Sep 17 00:00:00 2001
From: Tres Seaver
Date: Fri, 6 May 2016 14:31:25 -0400
Subject: [PATCH 4/6] Typo fix.

Addresses: https://github.com/GoogleCloudPlatform/gcloud-python/pull/1779#discussion_r62353112
---
 gcloud/streaming/transfer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcloud/streaming/transfer.py b/gcloud/streaming/transfer.py
index 6a546639bebd..4c5b2a7a23f8 100644
--- a/gcloud/streaming/transfer.py
+++ b/gcloud/streaming/transfer.py
@@ -852,7 +852,7 @@ def _configure_multipart_request(self, http_request):
 
         # NOTE: generate multipart message as bytes, not text
         stream = six.BytesIO()
-        if six.PY3:  # pragma: NO COVER Pythone
+        if six.PY3:  # pragma: NO COVER Python3
             generator_class = email_generator.BytesGenerator
         else:
             generator_class = email_generator.Generator

From ceb1c07a9237d6919f1591e13cbfcdee822eb904 Mon Sep 17 00:00:00 2001
From: Tres Seaver
Date: Wed, 11 May 2016 14:26:50 -0400
Subject: [PATCH 5/6] Remove inline disabling of 'no-name-in-module'

On 'master', it is now disabled globally.
---
 gcloud/streaming/test_transfer.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/gcloud/streaming/test_transfer.py b/gcloud/streaming/test_transfer.py
index 0bd7bc180645..dfaa00be70cb 100644
--- a/gcloud/streaming/test_transfer.py
+++ b/gcloud/streaming/test_transfer.py
@@ -1787,9 +1787,7 @@ def test__send_chunk_w_total_size_stream_exhausted(self):
 def _email_chunk_parser():
     import six
     if six.PY3:  # pragma: NO COVER Python3
-        # pylint: disable=no-name-in-module
         from email.parser import BytesParser
-        # pylint: enable=no-name-in-module
         parser = BytesParser()
         return parser.parsebytes
     else:

From 4a8e95206fdb0532b8d6a5f5fdf3360b307ab9c7 Mon Sep 17 00:00:00 2001
From: Tres Seaver
Date: Wed, 11 May 2016 16:45:17 -0400
Subject: [PATCH 6/6] Raise ValueError if text-mode file is passed to 'upload_from_file'.

Addresses: https://github.com/GoogleCloudPlatform/gcloud-python/pull/1779#issuecomment-218444196
---
 gcloud/bigquery/table.py      | 11 +++++++++--
 gcloud/bigquery/test_table.py | 13 +++++++++++++
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/gcloud/bigquery/table.py b/gcloud/bigquery/table.py
index 93863ed28547..df774c9f89ec 100644
--- a/gcloud/bigquery/table.py
+++ b/gcloud/bigquery/table.py
@@ -809,8 +809,9 @@ def upload_from_file(self,
         :rtype: :class:`gcloud.bigquery.jobs.LoadTableFromStorageJob`
         :returns: the job instance used to load the data (e.g., for
                   querying status)
-        :raises: :class:`ValueError` if size is not passed in and can not be
-                 determined
+        :raises: :class:`ValueError` if ``size`` is not passed in and can not
+                 be determined, or if the ``file_obj`` can be detected to be
+                 a file opened in text mode.
         """
         client = self._require_client(client)
         connection = client.connection
@@ -820,6 +821,12 @@ def upload_from_file(self,
         if rewind:
             file_obj.seek(0, os.SEEK_SET)
 
+        mode = getattr(file_obj, 'mode', None)
+        if mode is not None and mode != 'rb':
+            raise ValueError(
+                "Cannot upload files opened in text mode: use "
+                "open(filename, mode='rb')")
+
         # Get the basic stats about the file.
         total_bytes = size
         if total_bytes is None:
diff --git a/gcloud/bigquery/test_table.py b/gcloud/bigquery/test_table.py
index 9c4b235bcbc8..10895952d92d 100644
--- a/gcloud/bigquery/test_table.py
+++ b/gcloud/bigquery/test_table.py
@@ -1289,6 +1289,19 @@ def _row_data(row):
         self.assertEqual(req['path'], '/%s' % PATH)
         self.assertEqual(req['data'], SENT)
 
+    def test_upload_from_file_text_mode_file_failure(self):
+
+        class TextModeFile(object):
+            mode = 'r'
+
+        conn = _Connection()
+        client = _Client(project=self.PROJECT, connection=conn)
+        dataset = _Dataset(client)
+        file_obj = TextModeFile()
+        table = self._makeOne(self.TABLE_NAME, dataset=dataset)
+        with self.assertRaises(ValueError):
+            table.upload_from_file(file_obj, 'CSV', size=1234)
+
     def test_upload_from_file_size_failure(self):
         conn = _Connection()
         client = _Client(project=self.PROJECT, connection=conn)
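
For orientation, a short usage sketch of what this series changes from the caller's point of view. It is illustrative only: the client setup, project, dataset, table, and file names below are hypothetical, while upload_from_file, its 'CSV' and size arguments, and the binary-mode requirement come from the diffs above.

from gcloud import bigquery

client = bigquery.Client(project='my-project')   # hypothetical setup
dataset = client.dataset('my_dataset')
table = dataset.table('person_ages')

# After PATCH 6/6, a text-mode handle is rejected up front with ValueError.
with open('rows.csv') as file_obj:                # mode 'r' (text)
    table.upload_from_file(file_obj, 'CSV', size=1234)

# The supported path: open the file in binary mode (PATCH 2/6); the multipart
# request body is then assembled as bytes rather than text (PATCH 1/6).
with open('rows.csv', 'rb') as file_obj:
    table.upload_from_file(file_obj, 'CSV', size=1234)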