From fcd49f6e455c1a231570b2c14842116d7f828f30 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sat, 11 Jan 2020 15:23:29 +0900 Subject: [PATCH 01/32] Medium-scale refactoring Motivation: - smart_open_lib.py was cluttered - parsing logic was messy Main changes: - smart_open.parse_uri is now a publicly available function - Parsers for the individual schemes, e.g. HDFS, S3 now live in the corresponding submodule - Moved compression to a separate submodule I haven't changed any of the unit tests, in order to demonstrate that everything still works as expected from the outside. --- smart_open/compression.py | 109 +++++++ smart_open/hdfs.py | 18 ++ smart_open/http.py | 12 + smart_open/s3.py | 142 +++++++++- smart_open/smart_open_lib.py | 532 ++++++++++------------------------- smart_open/ssh.py | 23 +- smart_open/uri.py | 31 ++ smart_open/webhdfs.py | 33 ++- 8 files changed, 507 insertions(+), 393 deletions(-) create mode 100644 smart_open/compression.py create mode 100644 smart_open/uri.py diff --git a/smart_open/compression.py b/smart_open/compression.py new file mode 100644 index 00000000..223f87c3 --- /dev/null +++ b/smart_open/compression.py @@ -0,0 +1,109 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2020 Radim Rehurek +# +# This code is distributed under the terms and conditions +# from the MIT License (MIT). +# +import io +import os.path + +import six + + +_COMPRESSOR_REGISTRY = {} + + +def get_supported_extensions(): + return sorted(_COMPRESSOR_REGISTRY.keys()) + + +def register_compressor(ext, callback): + """Register a callback for transparently decompressing files with a specific extension. + + Parameters + ---------- + ext: str + The extension. + callback: callable + The callback. It must accept two position arguments, file_obj and mode. 
+ + Examples + -------- + + Instruct smart_open to use the identity function whenever opening a file + with a .xz extension (see README.rst for the complete example showing I/O): + + >>> def _handle_xz(file_obj, mode): + ... import lzma + ... return lzma.LZMAFile(filename=file_obj, mode=mode, format=lzma.FORMAT_XZ) + >>> + >>> register_compressor('.xz', _handle_xz) + + """ + if not (ext and ext[0] == '.'): + raise ValueError('ext must be a string starting with ., not %r' % ext) + if ext in _COMPRESSOR_REGISTRY: + logger.warning('overriding existing compression handler for %r', ext) + _COMPRESSOR_REGISTRY[ext] = callback + + +def _handle_bz2(file_obj, mode): + if six.PY2: + from bz2file import BZ2File + else: + from bz2 import BZ2File + return BZ2File(file_obj, mode) + + +def _handle_gzip(file_obj, mode): + import gzip + return gzip.GzipFile(fileobj=file_obj, mode=mode) + + +def compression_wrapper(file_obj, filename, mode): + """ + This function will wrap the file_obj with an appropriate + [de]compression mechanism based on the extension of the filename. + + file_obj must either be a filehandle object, or a class which behaves + like one. + + If the filename extension isn't recognized, will simply return the original + file_obj. 
+ """ + _, ext = os.path.splitext(filename) + + if _need_to_buffer(file_obj, mode, ext): + warnings.warn('streaming gzip support unavailable, see %s' % _ISSUE_189_URL) + file_obj = io.BytesIO(file_obj.read()) + if ext in _COMPRESSOR_REGISTRY and mode.endswith('+'): + raise ValueError('transparent (de)compression unsupported for mode %r' % mode) + + try: + callback = _COMPRESSOR_REGISTRY[ext] + except KeyError: + return file_obj + else: + return callback(file_obj, mode) + + +def _need_to_buffer(file_obj, mode, ext): + """Returns True if we need to buffer the whole file in memory in order to proceed.""" + try: + is_seekable = file_obj.seekable() + except AttributeError: + # + # Under Py2, built-in file objects returned by open do not have + # .seekable, but have a .seek method instead. + # + is_seekable = hasattr(file_obj, 'seek') + is_compressed = ext in _COMPRESSOR_REGISTRY + return six.PY2 and mode.startswith('r') and is_compressed and not is_seekable + + +# +# NB. avoid using lambda here to make stack traces more readable. 
+# +register_compressor('.bz2', _handle_bz2) +register_compressor('.gz', _handle_gzip) diff --git a/smart_open/hdfs.py b/smart_open/hdfs.py index 2485685f..0f3df482 100644 --- a/smart_open/hdfs.py +++ b/smart_open/hdfs.py @@ -18,8 +18,26 @@ import logging import subprocess +import smart_open.uri + +from six.moves.urllib import parse as urlparse + logger = logging.getLogger(__name__) +HDFS_SCHEME = 'hdfs' + + +def parse_uri(uri_as_string): + split_uri = urlparse.urlsplit(uri_as_string) + assert split_uri.scheme == HDFS_SCHEME + + uri_path = split_uri.netloc + split_uri.path + uri_path = "/" + uri_path.lstrip("/") + if not uri_path: + raise RuntimeError("invalid HDFS URI: %s" % str(parsed_uri)) + + return smart_open.uri.Uri(scheme=HDFS_SCHEME, uri_path=uri_path) + def open(uri, mode): if mode == 'rb': diff --git a/smart_open/http.py b/smart_open/http.py index 7530a942..3546f4d7 100644 --- a/smart_open/http.py +++ b/smart_open/http.py @@ -10,11 +10,14 @@ import io import logging +from six.moves.urllib import parse as urlparse import requests from smart_open import bytebuffer, s3 +import smart_open.uri DEFAULT_BUFFER_SIZE = 128 * 1024 +SUPPORTED_SCHEMES = ('http', 'https') logger = logging.getLogger(__name__) @@ -28,6 +31,15 @@ """ +def parse_uri(uri_as_string): + split_uri = urlparse.urlsplit(uri_as_string) + assert split_uri.scheme in SUPPORTED_SCHEMES + + uri_path = split_uri.netloc + split_uri.path + uri_path = "/" + uri_path.lstrip("/") + return smart_open.uri.Uri(scheme=split_uri.scheme, uri_path=uri_path) + + def open(uri, mode, kerberos=False, user=None, password=None, headers=None): """Implement streamed reader from a web site. 
diff --git a/smart_open/s3.py b/smart_open/s3.py index 80948ad2..89f26f01 100644 --- a/smart_open/s3.py +++ b/smart_open/s3.py @@ -13,13 +13,17 @@ import logging import warnings +import boto import boto3 import botocore.client import six +from six.moves.urllib import parse as urlparse +from botocore.exceptions import IncompleteReadError + import smart_open.bytebuffer +import smart_open.uri -from botocore.exceptions import IncompleteReadError logger = logging.getLogger(__name__) @@ -46,6 +50,8 @@ BINARY_NEWLINE = b'\n' SUPPORTED_SCHEMES = ("s3", "s3n", 's3u', "s3a") +DEFAULT_PORT = 443 +DEFAULT_HOST = 's3.amazonaws.com' DEFAULT_BUFFER_SIZE = 128 * 1024 @@ -55,6 +61,140 @@ WHENCE_CHOICES = [START, CURRENT, END] +def _my_urlsplit(url): + """This is a hack to prevent the regular urlsplit from splitting around question marks. + + A question mark (?) in a URL typically indicates the start of a + querystring, and the standard library's urlparse function handles the + querystring separately. Unfortunately, question marks can also appear + _inside_ the actual URL for some schemas like S3. + + Replaces question marks with newlines prior to splitting. This is safe because: + + 1. The standard library's urlsplit completely ignores newlines + 2. Raw newlines will never occur in innocuous URLs. They are always URL-encoded. + + See Also + -------- + https://github.com/python/cpython/blob/3.7/Lib/urllib/parse.py + https://github.com/RaRe-Technologies/smart_open/issues/285 + """ + sr = urlparse.urlsplit(url.replace('?', '\n'), allow_fragments=False) + return urlparse.SplitResult(sr.scheme, sr.netloc, sr.path.replace('\n', '?'), '', '') + + +def parse_uri(uri_as_string): + # + # Restrictions on bucket names and labels: + # + # - Bucket names must be at least 3 and no more than 63 characters long. + # - Bucket names must be a series of one or more labels. + # - Adjacent labels are separated by a single period (.). 
+ # - Bucket names can contain lowercase letters, numbers, and hyphens. + # - Each label must start and end with a lowercase letter or a number. + # + # We use the above as a guide only, and do not perform any validation. We + # let boto3 take care of that for us. + # + split_uri = _my_urlsplit(uri_as_string) + assert split_uri.scheme in SUPPORTED_SCHEMES + + port = DEFAULT_PORT + host = boto.config.get('s3', 'host', DEFAULT_HOST) + ordinary_calling_format = False + # + # These defaults tell boto3 to look for credentials elsewhere + # + access_id, access_secret = None, None + + # + # Common URI template [secret:key@][host[:port]@]bucket/object + # + # The urlparse function doesn't handle the above schema, so we have to do + # it ourselves. + # + uri = split_uri.netloc + split_uri.path + + if '@' in uri and ':' in uri.split('@')[0]: + auth, uri = uri.split('@', 1) + access_id, access_secret = auth.split(':') + + head, key_id = uri.split('/', 1) + if '@' in head and ':' in head: + ordinary_calling_format = True + host_port, bucket_id = head.split('@') + host, port = host_port.split(':', 1) + port = int(port) + elif '@' in head: + ordinary_calling_format = True + host, bucket_id = head.split('@') + else: + bucket_id = head + + return smart_open.uri.Uri( + scheme=split_uri.scheme, + bucket_id=bucket_id, + key_id=key_id, + port=port, + host=host, + ordinary_calling_format=ordinary_calling_format, + access_id=access_id, + access_secret=access_secret, + ) + + +def consolidate_params(uri, transport_params): + """Consolidates the parsed Uri with the additional parameters. + + This is necessary because the user can pass some of the parameters can in + two different ways: + + 1) Via the URI itself + 2) Via the transport parameters + + These are not mutually exclusive, but we have to pick one over the other + in a sensible way in order to proceed. 
+ + """ + transport_params = dict(transport_params) + + session = transport_params.get('session') + if session is not None and (uri.access_id or uri.access_secret): + logger.warning( + 'ignoring credentials parsed from URL because they conflict with ' + 'transport_params.session. Set transport_params.session to None ' + 'to suppress this warning.' + ) + uri = uri._replace(access_id=None, access_secret=None) + elif (uri.access_id and uri.access_secret): + transport_params['session'] = boto3.Session( + aws_access_key_id=uri.access_id, + aws_secret_access_key=uri.access_secret, + ) + uri = uri._replace(access_id=None, access_secret=None) + + if uri.host != DEFAULT_HOST: + endpoint_url = 'https://%s:%d' % (uri.host, uri.port) + _override_endpoint_url(transport_params, endpoint_url) + + return uri, transport_params + + +def _override_endpoint_url(transport_params, url): + try: + resource_kwargs = transport_params['resource_kwargs'] + except KeyError: + resource_kwargs = transport_params['resource_kwargs'] = {} + + if resource_kwargs.get('endpoint_url'): + logger.warning( + 'ignoring endpoint_url parsed from URL because it conflicts ' + 'with transport_params.resource_kwargs.endpoint_url. 
' + ) + else: + resource_kwargs.update(endpoint_url=url) + + def clamp(value, minval, maxval): return max(min(value, maxval), minval) diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index 743f11de..5bdfce18 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -26,7 +26,6 @@ import warnings import sys -import boto import boto3 import six @@ -42,6 +41,11 @@ import smart_open.http as smart_open_http import smart_open.ssh as smart_open_ssh +from smart_open.uri import Uri + +from smart_open import compression +from smart_open.compression import register_compressor + from smart_open import doctools # Import ``pathlib`` if the builtin ``pathlib`` or the backport ``pathlib2`` are @@ -60,89 +64,122 @@ _ISSUE_189_URL = 'https://github.com/RaRe-Technologies/smart_open/issues/189' -_DEFAULT_S3_HOST = 's3.amazonaws.com' -_COMPRESSOR_REGISTRY = {} +NO_SCHEME = '' +FILE_SCHEME = 'file' + + +def _parse_uri_file(uri_as_string): + if uri_as_string.startswith('file://'): + local_path = uri_as_string.replace('file://', '', 1) + else: + local_path = uri_as_string + + local_path = os.path.expanduser(local_path) + return Uri(scheme=FILE_SCHEME, uri_path=local_path) + + +def _generate_parsers(): + yield NO_SCHEME, _parse_uri_file + yield FILE_SCHEME, _parse_uri_file + yield smart_open_hdfs.HDFS_SCHEME, smart_open_hdfs.parse_uri + yield ( + smart_open_webhdfs.WEBHDFS_SCHEME, + lambda x: Uri(scheme=smart_open_webhdfs.WEBHDFS_SCHEME, uri_path=x), + ) + for scheme in smart_open_s3.SUPPORTED_SCHEMES: + yield scheme, smart_open_s3.parse_uri + for scheme in smart_open_ssh.SUPPORTED_SCHEMES: + yield scheme, smart_open_ssh.parse_uri + for scheme in smart_open_http.SUPPORTED_SCHEMES: + yield scheme, smart_open_http.parse_uri + +# +# A mapping of schemes (e.g. hdfs, s3) to functions that parse URLs of that shcheme. +# Each function should accept a single argument: the URL as a string. 
+# +_PARSERS = dict(_generate_parsers()) -def register_compressor(ext, callback): - """Register a callback for transparently decompressing files with a specific extension. +SUPPORTED_SCHEMES = tuple(sorted(_PARSERS.keys())) +"""The transport schemes that ``smart_open`` supports.""" + + +def _sniff_scheme(url_as_string): + """Returns the scheme of the URL only, as a string.""" + # + # urlsplit doesn't work on Windows -- it parses the drive as the scheme... + # no protocol given => assume a local file + # + if os.name == 'nt' and '://' not in uri_as_string: + uri_as_string = 'file://' + uri_as_string + + return urlparse.urlsplit(url_as_string).scheme + + +def parse_uri(uri_as_string): + """ + Parse the given URI from a string. Parameters ---------- - ext: str - The extension. - callback: callable - The callback. It must accept two position arguments, file_obj and mode. + uri_as_string: str + The URI to parse. - Examples - -------- + Returns + ------- + smart_open.uri.Uri + The parsed URI. - Instruct smart_open to use the identity function whenever opening a file - with a .xz extension (see README.rst for the complete example showing I/O): + Notes + ----- - >>> def _handle_xz(file_obj, mode): - ... import lzma - ... return lzma.LZMAFile(filename=file_obj, mode=mode, format=lzma.FORMAT_XZ) - >>> - >>> register_compressor('.xz', _handle_xz) + Supported URI schemes are: - """ - if not (ext and ext[0] == '.'): - raise ValueError('ext must be a string starting with ., not %r' % ext) - if ext in _COMPRESSOR_REGISTRY: - logger.warning('overriding existing compression handler for %r', ext) - _COMPRESSOR_REGISTRY[ext] = callback + * file + * hdfs + * http + * https + * s3 + * s3a + * s3n + * s3u + * webhdfs + .s3, s3a and s3n are treated the same way. s3u is s3 but without SSL. 
-def _handle_bz2(file_obj, mode): - if six.PY2: - from bz2file import BZ2File - else: - from bz2 import BZ2File - return BZ2File(file_obj, mode) + Valid URI examples:: + * s3://my_bucket/my_key + * s3://my_key:my_secret@my_bucket/my_key + * s3://my_key:my_secret@my_server:my_port@my_bucket/my_key + * hdfs:///path/file + * hdfs://path/file + * webhdfs://host:port/path/file + * ./local/path/file + * ~/local/path/file + * local/path/file + * ./local/path/file.gz + * file:///home/user/file + * file:///home/user/file.bz2 + * [ssh|scp|sftp]://username@host//path/file + * [ssh|scp|sftp]://username@host/path/file + + """ + scheme = _sniff_scheme(uri_as_string) -def _handle_gzip(file_obj, mode): - import gzip - return gzip.GzipFile(fileobj=file_obj, mode=mode) + try: + parser = _PARSERS[scheme] + except KeyError: + raise NotImplementedError("unknown URI scheme %r in %r" % (scheme, uri_as_string)) + return parser(uri_as_string) -# -# NB. avoid using lambda here to make stack traces more readable. -# -register_compressor('.bz2', _handle_bz2) -register_compressor('.gz', _handle_gzip) - - -Uri = collections.namedtuple( - 'Uri', - ( - 'scheme', - 'uri_path', - 'bucket_id', - 'key_id', - 'port', - 'host', - 'ordinary_calling_format', - 'access_id', - 'access_secret', - 'user', - 'password', - ) -) -"""Represents all the options that we parse from user input. -Some of the above options only make sense for certain protocols, e.g. -bucket_id is only for S3. -""" -# -# Set the default values for all Uri fields to be None. This allows us to only -# specify the relevant fields when constructing a Uri. # -# https://stackoverflow.com/questions/11351032/namedtuple-and-default-values-for-optional-keyword-arguments +# To keep old unit tests happy while I'm refactoring. 
# -Uri.__new__.__defaults__ = (None,) * len(Uri._fields) +_parse_uri = parse_uri def _inspect_kwargs(kallable): @@ -353,7 +390,7 @@ def open( if ignore_ext: decompressed = binary else: - decompressed = _compression_wrapper(binary, filename, mode) + decompressed = compression.compression_wrapper(binary, filename, mode) if 'b' not in mode or explicit_encoding is not None: decoded = _encoding_wrapper(decompressed, mode, encoding=encoding, errors=errors) @@ -490,12 +527,12 @@ def _shortcut_open( if not isinstance(uri, six.string_types): return None - parsed_uri = _parse_uri(uri) - if parsed_uri.scheme != 'file': + parsed_uri = parse_uri(uri) + if parsed_uri.scheme != FILE_SCHEME: return None _, extension = P.splitext(parsed_uri.uri_path) - if extension in _COMPRESSOR_REGISTRY and not ignore_ext: + if extension in compression.get_supported_extensions() and not ignore_ext: return None open_kwargs = {} @@ -541,337 +578,70 @@ def _open_binary_stream(uri, mode, transport_params): # raise NotImplementedError('unsupported mode: %r' % mode) - if isinstance(uri, six.string_types): - # this method just routes the request to classes handling the specific storage - # schemes, depending on the URI protocol in `uri` - filename = uri.split('/')[-1] - parsed_uri = _parse_uri(uri) - - if parsed_uri.scheme == "file": - fobj = io.open(parsed_uri.uri_path, mode) - return fobj, filename - elif parsed_uri.scheme in smart_open_ssh.SCHEMES: - fobj = smart_open_ssh.open( - parsed_uri.uri_path, - mode, - host=parsed_uri.host, - user=parsed_uri.user, - port=parsed_uri.port, - password=parsed_uri.password, - transport_params=transport_params, - ) - return fobj, filename - elif parsed_uri.scheme in smart_open_s3.SUPPORTED_SCHEMES: - return _s3_open_uri(parsed_uri, mode, transport_params), filename - elif parsed_uri.scheme == "hdfs": - _check_kwargs(smart_open_hdfs.open, transport_params) - return smart_open_hdfs.open(parsed_uri.uri_path, mode), filename - elif parsed_uri.scheme == "webhdfs": - kw 
= _check_kwargs(smart_open_webhdfs.open, transport_params) - http_uri = smart_open_webhdfs.convert_to_http_uri(parsed_uri) - return smart_open_webhdfs.open(http_uri, mode, **kw), filename - elif parsed_uri.scheme.startswith('http'): - # - # The URI may contain a query string and fragments, which interfere - # with our compressed/uncompressed estimation, so we strip them. - # - filename = P.basename(urlparse.urlparse(uri).path) - kw = _check_kwargs(smart_open_http.open, transport_params) - return smart_open_http.open(uri, mode, **kw), filename - else: - raise NotImplementedError("scheme %r is not supported", parsed_uri.scheme) - elif hasattr(uri, 'read'): + if hasattr(uri, 'read'): # simply pass-through if already a file-like # we need to return something as the file name, but we don't know what # so we probe for uri.name (e.g., this works with open() or tempfile.NamedTemporaryFile) - # if the value ends with COMPRESSED_EXT, we will note it in _compression_wrapper() + # if the value ends with COMPRESSED_EXT, we will note it in compression_wrapper() # if there is no such an attribute, we return "unknown" - this # effectively disables any compression filename = getattr(uri, 'name', 'unknown') return uri, filename - else: - raise TypeError("don't know how to handle uri %r" % uri) - - -def _s3_open_uri(uri, mode, transport_params): - logger.debug('s3_open_uri: %r', locals()) - if mode in ('r', 'w'): - raise ValueError('this function can only open binary streams. ' - 'Use smart_open.smart_open() to open text streams.') - elif mode not in ('rb', 'wb'): - raise NotImplementedError('unsupported mode: %r', mode) - - # - # There are two explicit ways we can receive session parameters from the user. - # - # 1. Via the session keyword argument (transport_params) - # 2. Via the URI itself - # - # They are not mutually exclusive, but we have to pick one of the two. - # Go with 1). 
- # - if transport_params.get('session') is not None and (uri.access_id or uri.access_secret): - logger.warning( - 'ignoring credentials parsed from URL because they conflict with ' - 'transport_params.session. Set transport_params.session to None ' - 'to suppress this warning.' - ) - elif (uri.access_id and uri.access_secret): - transport_params['session'] = boto3.Session( - aws_access_key_id=uri.access_id, - aws_secret_access_key=uri.access_secret, - ) - - # - # There are two explicit ways the user can provide the endpoint URI: - # - # 1. Via the URL. The protocol is implicit, and we assume HTTPS in this case. - # 2. Via the resource_kwargs and multipart_upload_kwargs endpoint_url parameter. - # - # Again, these are not mutually exclusive: the user can specify both. We - # have to pick one to proceed, however, and we go with 2. - # - if uri.host != _DEFAULT_S3_HOST: - endpoint_url = 'https://%s:%d' % (uri.host, uri.port) - _override_endpoint_url(transport_params, endpoint_url) - - kwargs = _check_kwargs(smart_open_s3.open, transport_params) - return smart_open_s3.open(uri.bucket_id, uri.key_id, mode, **kwargs) - - -def _override_endpoint_url(tp, url): - try: - resource_kwargs = tp['resource_kwargs'] - except KeyError: - resource_kwargs = tp['resource_kwargs'] = {} - if resource_kwargs.get('endpoint_url'): - logger.warning( - 'ignoring endpoint_url parsed from URL because it conflicts ' - 'with transport_params.resource_kwargs.endpoint_url. ' - ) - else: - resource_kwargs.update(endpoint_url=url) - - -def _my_urlsplit(url): - """This is a hack to prevent the regular urlsplit from splitting around question marks. - - A question mark (?) in a URL typically indicates the start of a - querystring, and the standard library's urlparse function handles the - querystring separately. Unfortunately, question marks can also appear - _inside_ the actual URL for some schemas like S3. - - Replaces question marks with newlines prior to splitting. This is safe because: - - 1. 
The standard library's urlsplit completely ignores newlines - 2. Raw newlines will never occur in innocuous URLs. They are always URL-encoded. - - See Also - -------- - https://github.com/python/cpython/blob/3.7/Lib/urllib/parse.py - https://github.com/RaRe-Technologies/smart_open/issues/285 - """ - parsed_url = urlparse.urlsplit(url, allow_fragments=False) - if parsed_url.scheme not in smart_open_s3.SUPPORTED_SCHEMES or '?' not in url: - return parsed_url - - sr = urlparse.urlsplit(url.replace('?', '\n'), allow_fragments=False) - return urlparse.SplitResult(sr.scheme, sr.netloc, sr.path.replace('\n', '?'), '', '') - - -def _parse_uri(uri_as_string): - """ - Parse the given URI from a string. - - Supported URI schemes are: - - * file - * hdfs - * http - * https - * s3 - * s3a - * s3n - * s3u - * webhdfs - - .s3, s3a and s3n are treated the same way. s3u is s3 but without SSL. - - Valid URI examples:: + if not isinstance(uri, six.string_types): + raise TypeError("don't know how to handle uri %r" % uri) - * s3://my_bucket/my_key - * s3://my_key:my_secret@my_bucket/my_key - * s3://my_key:my_secret@my_server:my_port@my_bucket/my_key - * hdfs:///path/file - * hdfs://path/file - * webhdfs://host:port/path/file - * ./local/path/file - * ~/local/path/file - * local/path/file - * ./local/path/file.gz - * file:///home/user/file - * file:///home/user/file.bz2 - * [ssh|scp|sftp]://username@host//path/file - * [ssh|scp|sftp]://username@host/path/file + filename = uri.split('/')[-1] + parsed_uri = parse_uri(uri) - """ - if os.name == 'nt': - # urlsplit doesn't work on Windows -- it parses the drive as the scheme... 
- if '://' not in uri_as_string: - # no protocol given => assume a local file - uri_as_string = 'file://' + uri_as_string - - parsed_uri = _my_urlsplit(uri_as_string) - - if parsed_uri.scheme == "hdfs": - return _parse_uri_hdfs(parsed_uri) - elif parsed_uri.scheme == "webhdfs": - return parsed_uri - elif parsed_uri.scheme in smart_open_s3.SUPPORTED_SCHEMES: - return _parse_uri_s3x(parsed_uri) - elif parsed_uri.scheme == 'file': - return _parse_uri_file(parsed_uri.netloc + parsed_uri.path) - elif parsed_uri.scheme in ('', None): - return _parse_uri_file(uri_as_string) - elif parsed_uri.scheme.startswith('http'): - return Uri(scheme=parsed_uri.scheme, uri_path=uri_as_string) - elif parsed_uri.scheme in smart_open_ssh.SCHEMES: - return _parse_uri_ssh(parsed_uri) - else: - raise NotImplementedError( - "unknown URI scheme %r in %r" % (parsed_uri.scheme, uri_as_string) + bad_scheme = NotImplementedError( + "scheme %r is not supported, expected one of %r" % ( + parsed_uri.scheme, SUPPORTED_SCHEMES, ) - - -def _parse_uri_hdfs(parsed_uri): - assert parsed_uri.scheme == 'hdfs' - uri_path = parsed_uri.netloc + parsed_uri.path - uri_path = "/" + uri_path.lstrip("/") - if not uri_path: - raise RuntimeError("invalid HDFS URI: %s" % str(parsed_uri)) - - return Uri(scheme='hdfs', uri_path=uri_path) - - -def _parse_uri_s3x(parsed_uri): - # - # Restrictions on bucket names and labels: - # - # - Bucket names must be at least 3 and no more than 63 characters long. - # - Bucket names must be a series of one or more labels. - # - Adjacent labels are separated by a single period (.). - # - Bucket names can contain lowercase letters, numbers, and hyphens. - # - Each label must start and end with a lowercase letter or a number. - # - # We use the above as a guide only, and do not perform any validation. We - # let boto3 take care of that for us. 
- # - assert parsed_uri.scheme in smart_open_s3.SUPPORTED_SCHEMES - - port = 443 - host = boto.config.get('s3', 'host', _DEFAULT_S3_HOST) - ordinary_calling_format = False - # - # These defaults tell boto3 to look for credentials elsewhere - # - access_id, access_secret = None, None - - # - # Common URI template [secret:key@][host[:port]@]bucket/object - # - # The urlparse function doesn't handle the above schema, so we have to do - # it ourselves. - # - uri = parsed_uri.netloc + parsed_uri.path - - if '@' in uri and ':' in uri.split('@')[0]: - auth, uri = uri.split('@', 1) - access_id, access_secret = auth.split(':') - - head, key_id = uri.split('/', 1) - if '@' in head and ':' in head: - ordinary_calling_format = True - host_port, bucket_id = head.split('@') - host, port = host_port.split(':', 1) - port = int(port) - elif '@' in head: - ordinary_calling_format = True - host, bucket_id = head.split('@') - else: - bucket_id = head - - return Uri( - scheme=parsed_uri.scheme, bucket_id=bucket_id, key_id=key_id, - port=port, host=host, ordinary_calling_format=ordinary_calling_format, - access_id=access_id, access_secret=access_secret - ) - - -def _parse_uri_file(input_path): - # '~/tmp' may be expanded to '/Users/username/tmp' - uri_path = os.path.expanduser(input_path) - - if not uri_path: - raise RuntimeError("invalid file URI: %s" % input_path) - - return Uri(scheme='file', uri_path=uri_path) - - -def _parse_uri_ssh(unt): - """Parse a Uri from a urllib namedtuple.""" - return Uri( - scheme=unt.scheme, - uri_path=_unquote(unt.path), - user=_unquote(unt.username), - host=unt.hostname, - port=int(unt.port or smart_open_ssh.DEFAULT_PORT), - password=_unquote(unt.password), ) + if parsed_uri.scheme not in SUPPORTED_SCHEMES: + raise bad_scheme + + if parsed_uri.scheme == FILE_SCHEME: + fobj = io.open(parsed_uri.uri_path, mode) + return fobj, filename + + if parsed_uri.scheme in smart_open_ssh.SUPPORTED_SCHEMES: + fobj = smart_open_ssh.open( + parsed_uri.uri_path, + mode, 
+ host=parsed_uri.host, + user=parsed_uri.user, + port=parsed_uri.port, + password=parsed_uri.password, + transport_params=transport_params, + ) + return fobj, filename + if parsed_uri.scheme in smart_open_s3.SUPPORTED_SCHEMES: + parsed_uri, transport_params = smart_open_s3.consolidate_params(parsed_uri, transport_params) + kw = _check_kwargs(smart_open_s3.open, transport_params) + fobj = smart_open_s3.open(parsed_uri.bucket_id, parsed_uri.key_id, mode, **kw) + return fobj, filename -def _unquote(text): - return text and urlparse.unquote(text) + if parsed_uri.scheme == smart_open_hdfs.HDFS_SCHEME: + _check_kwargs(smart_open_hdfs.open, transport_params) + return smart_open_hdfs.open(parsed_uri.uri_path, mode), filename + if parsed_uri.scheme == smart_open_webhdfs.WEBHDFS_SCHEME: + kw = _check_kwargs(smart_open_webhdfs.open, transport_params) + return smart_open_webhdfs.open(uri, mode, **kw), filename -def _need_to_buffer(file_obj, mode, ext): - """Returns True if we need to buffer the whole file in memory in order to proceed.""" - try: - is_seekable = file_obj.seekable() - except AttributeError: + if parsed_uri.scheme in smart_open_http.SUPPORTED_SCHEMES: # - # Under Py2, built-in file objects returned by open do not have - # .seekable, but have a .seek method instead. + # The URI may contain a query string and fragments, which interfere + # with our compressed/uncompressed estimation, so we strip them. # - is_seekable = hasattr(file_obj, 'seek') - return six.PY2 and mode.startswith('r') and ext in _COMPRESSOR_REGISTRY and not is_seekable - - -def _compression_wrapper(file_obj, filename, mode): - """ - This function will wrap the file_obj with an appropriate - [de]compression mechanism based on the extension of the filename. - - file_obj must either be a filehandle object, or a class which behaves - like one. 
+ filename = P.basename(urlparse.urlparse(uri).path) + kw = _check_kwargs(smart_open_http.open, transport_params) + return smart_open_http.open(uri, mode, **kw), filename - If the filename extension isn't recognized, will simply return the original - file_obj. - """ - _, ext = os.path.splitext(filename) - - if _need_to_buffer(file_obj, mode, ext): - warnings.warn('streaming gzip support unavailable, see %s' % _ISSUE_189_URL) - file_obj = io.BytesIO(file_obj.read()) - if ext in _COMPRESSOR_REGISTRY and mode.endswith('+'): - raise ValueError('transparent (de)compression unsupported for mode %r' % mode) - - try: - callback = _COMPRESSOR_REGISTRY[ext] - except KeyError: - return file_obj - else: - return callback(file_obj, mode) + raise bad_scheme def _encoding_wrapper(fileobj, mode, encoding=None, errors=None): diff --git a/smart_open/ssh.py b/smart_open/ssh.py index 6290f839..f065d788 100644 --- a/smart_open/ssh.py +++ b/smart_open/ssh.py @@ -26,6 +26,10 @@ import logging import warnings +from six.moves.urllib import parse as urlparse + +import smart_open.uri + logger = logging.getLogger(__name__) # @@ -33,12 +37,29 @@ # _SSH = {} -SCHEMES = ("ssh", "scp", "sftp") +SUPPORTED_SCHEMES = ("ssh", "scp", "sftp") """Supported URL schemes.""" DEFAULT_PORT = 22 +def _unquote(text): + return text and urlparse.unquote(text) + + +def parse_uri(uri_as_string): + split_uri = urlparse.urlsplit(uri_as_string) + assert split_uri.scheme in SUPPORTED_SCHEMES + return smart_open.uri.Uri( + scheme=split_uri.scheme, + uri_path=_unquote(split_uri.path), + user=_unquote(split_uri.username), + host=split_uri.hostname, + port=int(split_uri.port or DEFAULT_PORT), + password=_unquote(split_uri.password), + ) + + def _connect(hostname, username, port, password, transport_params): try: import paramiko diff --git a/smart_open/uri.py b/smart_open/uri.py new file mode 100644 index 00000000..0f558146 --- /dev/null +++ b/smart_open/uri.py @@ -0,0 +1,31 @@ +import collections + + +Uri = 
collections.namedtuple( + 'Uri', + ( + 'scheme', + 'uri_path', + 'bucket_id', + 'key_id', + 'port', + 'host', + 'ordinary_calling_format', + 'access_id', + 'access_secret', + 'user', + 'password', + ) +) +"""Represents all the options that we parse from user input. + +Some of the above options only make sense for certain protocols, e.g. +bucket_id is only for S3. +""" +# +# Set the default values for all Uri fields to be None. This allows us to only +# specify the relevant fields when constructing a Uri. +# +# https://stackoverflow.com/questions/11351032/namedtuple-and-default-values-for-optional-keyword-arguments +# +Uri.__new__.__defaults__ = (None,) * len(Uri._fields) diff --git a/smart_open/webhdfs.py b/smart_open/webhdfs.py index 5ac35c61..1fcfb992 100644 --- a/smart_open/webhdfs.py +++ b/smart_open/webhdfs.py @@ -26,6 +26,8 @@ logger = logging.getLogger(__name__) +WEBHDFS_SCHEME = 'webhdfs' + WEBHDFS_MIN_PART_SIZE = 50 * 1024**2 # minimum part size for HDFS multipart uploads @@ -39,6 +41,9 @@ def open(http_uri, mode, min_part_size=WEBHDFS_MIN_PART_SIZE): For writing only. 
""" + if http_uri.startswith(WEBHDFS_SCHEME): + http_uri = _convert_to_http_uri(http_uri) + if mode == 'rb': return BufferedInputBase(http_uri) elif mode == 'wb': @@ -47,29 +52,37 @@ def open(http_uri, mode, min_part_size=WEBHDFS_MIN_PART_SIZE): raise NotImplementedError("webhdfs support for mode %r not implemented" % mode) -def convert_to_http_uri(parsed_uri): +def _convert_to_http_uri(webhdfs_url): """ Convert webhdfs uri to http url and return it as text Parameters ---------- - parsed_uri: str - result of urlsplit of webhdfs url + webhdfs_url: str + A URL starting with webhdfs:// """ - netloc = parsed_uri.hostname - if parsed_uri.port: - netloc += ":{}".format(parsed_uri.port) - query = parsed_uri.query - if parsed_uri.username: + split_uri = urlparse.urlsplit(webhdfs_url) + netloc = split_uri.hostname + if split_uri.port: + netloc += ":{}".format(split_uri.port) + query = split_uri.query + if split_uri.username: query += ( - ("&" if query else "") + "user.name=" + urlparse.quote(parsed_uri.username) + ("&" if query else "") + "user.name=" + urlparse.quote(split_uri.username) ) return urlparse.urlunsplit( - ("http", netloc, "/webhdfs/v1" + parsed_uri.path, query, "") + ("http", netloc, "/webhdfs/v1" + split_uri.path, query, "") ) +# +# For old unit tests. 
+# +def convert_to_http_uri(parsed_uri): + return _convert_to_http_uri(parsed_uri.uri_path) + + class BufferedInputBase(io.BufferedIOBase): def __init__(self, uri): self._uri = uri From 656d2b4ae16419b4934c2684f8d1cb03c1adceb4 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sat, 11 Jan 2020 23:51:44 +0900 Subject: [PATCH 02/32] more refactoring --- extending.md | 69 ++++++++++ smart_open/compression.py | 9 +- smart_open/file.py | 31 +++++ smart_open/hdfs.py | 17 ++- smart_open/http.py | 14 +- smart_open/s3.py | 33 +++-- smart_open/smart_open_lib.py | 258 ++++++++++++----------------------- smart_open/ssh.py | 16 ++- smart_open/uri.py | 31 ----- smart_open/utils.py | 53 +++++++ smart_open/webhdfs.py | 20 ++- 11 files changed, 326 insertions(+), 225 deletions(-) create mode 100644 extending.md create mode 100644 smart_open/file.py delete mode 100644 smart_open/uri.py create mode 100644 smart_open/utils.py diff --git a/extending.md b/extending.md new file mode 100644 index 00000000..556f4e28 --- /dev/null +++ b/extending.md @@ -0,0 +1,69 @@ +# Extending `smart_open` + +This document targets potential contributors to `smart_open`. +Currently, there are two main directions for extending existing `smart_open` functionality: + +1. Add a new transport mechanism +2. Add a new compression format + +## New transport mechanisms + +Each transport mechanism lives in its own submodule. +For example, currently we have: + +- `smart_open.file` +- `smart_open.s3` +- `smart_open.ssh` +- ... and others + +So, to implement a new transport mechanism, you need to create a new module. +Your module should expose the following: + +```python +XXX_SCHEMA = ... +"""The name of the mechanism, e.g. s3, ssh, etc. + +This is the part that goes before the `://` in a URL, e.g. `s3://`.""" + +def parse_uri(uri_as_str): + """Parse the specified URI into a dict. + + At a bare minimum, the dict must have `schema` member. + """ + return dict(schema=XXX_SCHEMA, ...) 
+ + +def open_uri(uri_as_str, mode, transport_params): + """Return a file-like object pointing to the URI.""" + ... +``` + +Have a look at the existing mechanisms to see how they work. +You may define other functions and classes as necessary for your implementation. + +Once your module is working, register it in the `smart_open/smart_open_lib.py` file. +The `_generate_transport()` generator builds a dictionary that maps schemes to the modules that implement functionality for them. +Include your new mechanism in that generator, and `smart_open` will be able to use it. + +## New compression mechanisms + +The compression layer is self-contained in the `smart_open.compression` submodule. + +To add support for a new compressor: + +- Create a new function to handle your compression format (given an extension) +- Add your compressor to the registry + +For example: + +```python +def _handle_xz(file_obj, mode): + import lzma + return lzma.LZMAFile(filename=file_obj, mode=mode, format=lzma.FORMAT_XZ) + +register_compressor('.xz', _handle_xz) +``` + +There are many compression formats out there, and supporting all of them is beyond the scope of `smart_open`. +We want our code's functionality to cover the bare minimum required to satisfy 80% of our users. +We leave the remaining 20% of users with the ability to deal with compression in their own code, using the trivial mechanism described above. diff --git a/smart_open/compression.py b/smart_open/compression.py index 223f87c3..c71fb791 100644 --- a/smart_open/compression.py +++ b/smart_open/compression.py @@ -5,16 +5,23 @@ # This code is distributed under the terms and conditions # from the MIT License (MIT). 
# +"""Implements the compression layer of the ``smart_open`` library.""" import io +import logging import os.path +import warnings import six +logger = logging.getLogger(__name__) + _COMPRESSOR_REGISTRY = {} +_ISSUE_189_URL = 'https://github.com/RaRe-Technologies/smart_open/issues/189' def get_supported_extensions(): + """Return the list of file extensions for which we have registered compressors.""" return sorted(_COMPRESSOR_REGISTRY.keys()) @@ -31,7 +38,7 @@ def register_compressor(ext, callback): Examples -------- - Instruct smart_open to use the identity function whenever opening a file + Instruct smart_open to use the `lzma` module whenever opening a file with a .xz extension (see README.rst for the complete example showing I/O): >>> def _handle_xz(file_obj, mode): diff --git a/smart_open/file.py b/smart_open/file.py new file mode 100644 index 00000000..d81df43b --- /dev/null +++ b/smart_open/file.py @@ -0,0 +1,31 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2020 Radim Rehurek +# +# This code is distributed under the terms and conditions +# from the MIT License (MIT). 
+# +"""Implements the transport for the file:// schema.""" +import io +import os.path + +FILE_SCHEME = 'file' + + +def parse_uri(uri_as_string): + local_path = extract_local_path(uri_as_string) + return dict(scheme=FILE_SCHEME, uri_path=local_path) + + +def open_uri(uri_as_string, mode, transport_params): + parsed_uri = parse_uri(uri_as_string) + fobj = io.open(parsed_uri['uri_path'], mode) + return fobj + + +def extract_local_path(uri_as_string): + if uri_as_string.startswith('file://'): + local_path = uri_as_string.replace('file://', '', 1) + else: + local_path = uri_as_string + return os.path.expanduser(local_path) diff --git a/smart_open/hdfs.py b/smart_open/hdfs.py index 0f3df482..f2adf681 100644 --- a/smart_open/hdfs.py +++ b/smart_open/hdfs.py @@ -18,10 +18,10 @@ import logging import subprocess -import smart_open.uri - from six.moves.urllib import parse as urlparse +from smart_open import utils + logger = logging.getLogger(__name__) HDFS_SCHEME = 'hdfs' @@ -34,9 +34,18 @@ def parse_uri(uri_as_string): uri_path = split_uri.netloc + split_uri.path uri_path = "/" + uri_path.lstrip("/") if not uri_path: - raise RuntimeError("invalid HDFS URI: %s" % str(parsed_uri)) + raise RuntimeError("invalid HDFS URI: %r" % uri_as_string) + + return dict(scheme=HDFS_SCHEME, uri_path=uri_path) + + +def open_uri(uri, mode, transport_params): + utils.check_kwargs(open, transport_params) - return smart_open.uri.Uri(scheme=HDFS_SCHEME, uri_path=uri_path) + parsed_uri = parse_uri(uri) + fobj = open(parsed_uri['uri_path'], mode) + fobj.name = parsed_uri['uri_path'].split('/')[-1] + return fobj def open(uri, mode): diff --git a/smart_open/http.py b/smart_open/http.py index 3546f4d7..de901e79 100644 --- a/smart_open/http.py +++ b/smart_open/http.py @@ -9,12 +9,13 @@ import io import logging +import os.path from six.moves.urllib import parse as urlparse import requests from smart_open import bytebuffer, s3 -import smart_open.uri +import smart_open.utils DEFAULT_BUFFER_SIZE = 128 * 
1024 SUPPORTED_SCHEMES = ('http', 'https') @@ -37,7 +38,12 @@ def parse_uri(uri_as_string): uri_path = split_uri.netloc + split_uri.path uri_path = "/" + uri_path.lstrip("/") - return smart_open.uri.Uri(scheme=split_uri.scheme, uri_path=uri_path) + return dict(scheme=split_uri.scheme, uri_path=uri_path) + + +def open_uri(uri, mode, transport_params): + kwargs = smart_open.utils.check_kwargs(open, transport_params) + return open(uri, mode, **kwargs) def open(uri, mode, kerberos=False, user=None, password=None, headers=None): @@ -69,10 +75,12 @@ def open(uri, mode, kerberos=False, user=None, password=None, headers=None): """ if mode == 'rb': - return SeekableBufferedInputBase( + fobj = SeekableBufferedInputBase( uri, mode, kerberos=kerberos, user=user, password=password, headers=headers ) + fobj.name = os.path.basename(urlparse.urlparse(uri).path) + return fobj else: raise NotImplementedError('http support for mode %r not implemented' % mode) diff --git a/smart_open/s3.py b/smart_open/s3.py index 89f26f01..014820e8 100644 --- a/smart_open/s3.py +++ b/smart_open/s3.py @@ -22,7 +22,7 @@ from botocore.exceptions import IncompleteReadError import smart_open.bytebuffer -import smart_open.uri +import smart_open.utils logger = logging.getLogger(__name__) @@ -131,7 +131,7 @@ def parse_uri(uri_as_string): else: bucket_id = head - return smart_open.uri.Uri( + return dict( scheme=split_uri.scheme, bucket_id=bucket_id, key_id=key_id, @@ -143,14 +143,14 @@ def parse_uri(uri_as_string): ) -def consolidate_params(uri, transport_params): +def _consolidate_params(uri, transport_params): """Consolidates the parsed Uri with the additional parameters. This is necessary because the user can pass some of the parameters can in two different ways: 1) Via the URI itself - 2) Via the transport parameters + 2) Via the transport parameters These are not mutually exclusive, but we have to pick one over the other in a sensible way in order to proceed. 
@@ -159,22 +159,22 @@ def consolidate_params(uri, transport_params): transport_params = dict(transport_params) session = transport_params.get('session') - if session is not None and (uri.access_id or uri.access_secret): + if session is not None and (uri['access_id'] or uri['access_secret']): logger.warning( 'ignoring credentials parsed from URL because they conflict with ' 'transport_params.session. Set transport_params.session to None ' 'to suppress this warning.' ) - uri = uri._replace(access_id=None, access_secret=None) - elif (uri.access_id and uri.access_secret): + uri.update(access_id=None, access_secret=None) + elif (uri['access_id'] and uri['access_secret']): transport_params['session'] = boto3.Session( - aws_access_key_id=uri.access_id, - aws_secret_access_key=uri.access_secret, + aws_access_key_id=uri['access_id'], + aws_secret_access_key=uri['access_secret'], ) - uri = uri._replace(access_id=None, access_secret=None) + uri.update(access_id=None, access_secret=None) - if uri.host != DEFAULT_HOST: - endpoint_url = 'https://%s:%d' % (uri.host, uri.port) + if uri['host'] != DEFAULT_HOST: + endpoint_url = 'https://%(host)s:%(port)d' % uri _override_endpoint_url(transport_params, endpoint_url) return uri, transport_params @@ -208,6 +208,13 @@ def make_range_string(start, stop=None): return 'bytes=%d-%d' % (start, stop) +def open_uri(uri, mode, transport_params): + parsed_uri = parse_uri(uri) + parsed_uri, transport_params = _consolidate_params(parsed_uri, transport_params) + kwargs = smart_open.utils.check_kwargs(open, transport_params) + return open(parsed_uri['bucket_id'], parsed_uri['key_id'], mode, **kwargs) + + def open( bucket_id, key_id, @@ -277,6 +284,8 @@ def open( ) else: assert False, 'unexpected mode: %r' % mode + + fileobj.name = key_id return fileobj diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index 5bdfce18..8d462e25 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -10,8 +10,9 @@ The 
main functions are: - * `open()` - * `register_compressor()` + * ``parse_uri()`` + * ``open()`` + * ``register_compressor()`` """ @@ -20,7 +21,6 @@ import logging import io import importlib -import inspect import os import os.path as P import warnings @@ -35,18 +35,23 @@ # This module defines a function called smart_open so we cannot use # smart_open.submodule to reference to the submodules. # -import smart_open.s3 as smart_open_s3 -import smart_open.hdfs as smart_open_hdfs -import smart_open.webhdfs as smart_open_webhdfs -import smart_open.http as smart_open_http -import smart_open.ssh as smart_open_ssh - -from smart_open.uri import Uri +import smart_open.file as so_file +import smart_open.s3 as so_s3 +import smart_open.hdfs as so_hdfs +import smart_open.webhdfs as so_webhdfs +import smart_open.http as so_http +import smart_open.ssh as so_ssh from smart_open import compression -from smart_open.compression import register_compressor - from smart_open import doctools +from smart_open import utils + +# +# For backwards compatibility and keeping old unit tests happy. +# +from smart_open.compression import register_compressor # noqa: F401 +from smart_open.utils import check_kwargs as _check_kwargs # noqa: F401 +from smart_open.utils import inspect_kwargs as _inspect_kwargs # noqa: F401 # Import ``pathlib`` if the builtin ``pathlib`` or the backport ``pathlib2`` are # available. The builtin ``pathlib`` will be imported with higher precedence. 
@@ -62,50 +67,38 @@ SYSTEM_ENCODING = sys.getdefaultencoding() -_ISSUE_189_URL = 'https://github.com/RaRe-Technologies/smart_open/issues/189' - - NO_SCHEME = '' -FILE_SCHEME = 'file' +_TO_BINARY_LUT = { + 'r': 'rb', 'r+': 'rb+', 'rt': 'rb', 'rt+': 'rb+', + 'w': 'wb', 'w+': 'wb+', 'wt': 'wb', "wt+": 'wb+', + 'a': 'ab', 'a+': 'ab+', 'at': 'ab', 'at+': 'ab+', +} -def _parse_uri_file(uri_as_string): - if uri_as_string.startswith('file://'): - local_path = uri_as_string.replace('file://', '', 1) - else: - local_path = uri_as_string - - local_path = os.path.expanduser(local_path) - return Uri(scheme=FILE_SCHEME, uri_path=local_path) +def _generate_transport(): + yield NO_SCHEME, so_file + yield so_file.FILE_SCHEME, so_file + yield so_hdfs.HDFS_SCHEME, so_hdfs + yield so_webhdfs.WEBHDFS_SCHEME, so_webhdfs + for scheme in so_s3.SUPPORTED_SCHEMES: + yield scheme, so_s3 + for scheme in so_ssh.SUPPORTED_SCHEMES: + yield scheme, so_ssh + for scheme in so_http.SUPPORTED_SCHEMES: + yield scheme, so_http -def _generate_parsers(): - yield NO_SCHEME, _parse_uri_file - yield FILE_SCHEME, _parse_uri_file - yield smart_open_hdfs.HDFS_SCHEME, smart_open_hdfs.parse_uri - yield ( - smart_open_webhdfs.WEBHDFS_SCHEME, - lambda x: Uri(scheme=smart_open_webhdfs.WEBHDFS_SCHEME, uri_path=x), - ) - for scheme in smart_open_s3.SUPPORTED_SCHEMES: - yield scheme, smart_open_s3.parse_uri - for scheme in smart_open_ssh.SUPPORTED_SCHEMES: - yield scheme, smart_open_ssh.parse_uri - for scheme in smart_open_http.SUPPORTED_SCHEMES: - yield scheme, smart_open_http.parse_uri +_TRANSPORT = dict(_generate_transport()) +for schema, transport in _TRANSPORT.items(): + assert hasattr(transport, 'open_uri'), '%r is missing open_uri' % schema + assert hasattr(transport, 'parse_uri'), '%r is missing parse_uri' % schema -# -# A mapping of schemes (e.g. hdfs, s3) to functions that parse URLs of that shcheme. -# Each function should accept a single argument: the URL as a string. 
-# -_PARSERS = dict(_generate_parsers()) - -SUPPORTED_SCHEMES = tuple(sorted(_PARSERS.keys())) +SUPPORTED_SCHEMES = tuple(sorted(_TRANSPORT.keys())) """The transport schemes that ``smart_open`` supports.""" -def _sniff_scheme(url_as_string): +def _sniff_scheme(uri_as_string): """Returns the scheme of the URL only, as a string.""" # # urlsplit doesn't work on Windows -- it parses the drive as the scheme... @@ -114,7 +107,7 @@ def _sniff_scheme(url_as_string): if os.name == 'nt' and '://' not in uri_as_string: uri_as_string = 'file://' + uri_as_string - return urlparse.urlsplit(url_as_string).scheme + return urlparse.urlsplit(uri_as_string).scheme def parse_uri(uri_as_string): @@ -128,7 +121,7 @@ def parse_uri(uri_as_string): Returns ------- - smart_open.uri.Uri + collections.namedtuple The parsed URI. Notes @@ -169,67 +162,29 @@ def parse_uri(uri_as_string): scheme = _sniff_scheme(uri_as_string) try: - parser = _PARSERS[scheme] + transport = _TRANSPORT[scheme] except KeyError: raise NotImplementedError("unknown URI scheme %r in %r" % (scheme, uri_as_string)) - return parser(uri_as_string) - - -# -# To keep old unit tests happy while I'm refactoring. -# -_parse_uri = parse_uri - - -def _inspect_kwargs(kallable): - # - # inspect.getargspec got deprecated in Py3.4, and calling it spews - # deprecation warnings that we'd prefer to avoid. Unfortunately, older - # versions of Python (<3.3) did not have inspect.signature, so we need to - # handle them the old-fashioned getargspec way. 
- # try: - signature = inspect.signature(kallable) + parse_uri = getattr(transport, 'parse_uri') except AttributeError: - args, varargs, keywords, defaults = inspect.getargspec(kallable) - if not defaults: - return {} - supported_keywords = args[-len(defaults):] - return dict(zip(supported_keywords, defaults)) - else: - return { - name: param.default - for name, param in signature.parameters.items() - if param.default != inspect.Parameter.empty - } - + raise NotImplementedError('%r transport does not implement parse_uri', scheme) -def _check_kwargs(kallable, kwargs): - """Check which keyword arguments the callable supports. + as_dict = parse_uri(uri_as_string) - Parameters - ---------- - kallable: callable - A function or method to test - kwargs: dict - The keyword arguments to check. If the callable doesn't support any - of these, a warning message will get printed. - - Returns - ------- - dict - A dictionary of argument names and values supported by the callable. - """ - supported_keywords = sorted(_inspect_kwargs(kallable)) - unsupported_keywords = [k for k in sorted(kwargs) if k not in supported_keywords] - supported_kwargs = {k: v for (k, v) in kwargs.items() if k in supported_keywords} - - if unsupported_keywords: - logger.warning('ignoring unsupported keyword arguments: %r', unsupported_keywords) + # + # The conversion to a namedtuple is just to keep the old tests happy while + # I'm still refactoring. + # + Uri = collections.namedtuple('Uri', sorted(as_dict.keys())) + return Uri(**as_dict) - return supported_kwargs +# +# To keep old unit tests happy while I'm refactoring. 
+# +_parse_uri = parse_uri _builtin_open = open @@ -377,20 +332,12 @@ def open( # filename ---------------> bytes -------------> bytes ---------> text # binary decompressed decode # - try: - binary_mode = {'r': 'rb', 'r+': 'rb+', - 'rt': 'rb', 'rt+': 'rb+', - 'w': 'wb', 'w+': 'wb+', - 'wt': 'wb', "wt+": 'wb+', - 'a': 'ab', 'a+': 'ab+', - 'at': 'ab', 'at+': 'ab+'}[mode] - except KeyError: - binary_mode = mode - binary, filename = _open_binary_stream(uri, binary_mode, transport_params) + binary_mode = _TO_BINARY_LUT.get(mode, mode) + binary = _open_binary_stream(uri, binary_mode, transport_params) if ignore_ext: decompressed = binary else: - decompressed = compression.compression_wrapper(binary, filename, mode) + decompressed = compression.compression_wrapper(binary, binary.name, mode) if 'b' not in mode or explicit_encoding is not None: decoded = _encoding_wrapper(decompressed, mode, encoding=encoding, errors=errors) @@ -405,19 +352,19 @@ def open( # open.__doc__ = None if open.__doc__ is None else open.__doc__ % { 's3': doctools.to_docstring( - doctools.extract_kwargs(smart_open_s3.open.__doc__), + doctools.extract_kwargs(so_s3.open.__doc__), lpad=u' ', ), 'http': doctools.to_docstring( - doctools.extract_kwargs(smart_open_http.open.__doc__), + doctools.extract_kwargs(so_http.open.__doc__), lpad=u' ', ), 'webhdfs': doctools.to_docstring( - doctools.extract_kwargs(smart_open_webhdfs.open.__doc__), + doctools.extract_kwargs(so_webhdfs.open.__doc__), lpad=u' ', ), 'ssh': doctools.to_docstring( - doctools.extract_kwargs(smart_open_ssh.open.__doc__), + doctools.extract_kwargs(so_ssh.open.__doc__), lpad=u' ', ), 'examples': doctools.extract_examples_from_readme_rst(), @@ -447,7 +394,7 @@ def smart_open(uri, mode="rb", **kw): # ignore_extension = kw.pop('ignore_extension', False) - expected_kwargs = _inspect_kwargs(open) + expected_kwargs = utils.inspect_kwargs(open) scrubbed_kwargs = {} transport_params = {} @@ -527,11 +474,12 @@ def _shortcut_open( if not 
isinstance(uri, six.string_types): return None - parsed_uri = parse_uri(uri) - if parsed_uri.scheme != FILE_SCHEME: + scheme = _sniff_scheme(uri) + if scheme not in (NO_SCHEME, so_file.FILE_SCHEME): return None - _, extension = P.splitext(parsed_uri.uri_path) + local_path = so_file.extract_local_path(uri) + _, extension = P.splitext(local_path) if extension in compression.get_supported_extensions() and not ignore_ext: return None @@ -554,10 +502,10 @@ def _shortcut_open( # kwargs, then we have no option other to use io.open. # if six.PY3: - return _builtin_open(parsed_uri.uri_path, mode, buffering=buffering, **open_kwargs) + return _builtin_open(local_path, mode, buffering=buffering, **open_kwargs) elif not open_kwargs: - return _builtin_open(parsed_uri.uri_path, mode, buffering=buffering) - return io.open(parsed_uri.uri_path, mode, buffering=buffering, **open_kwargs) + return _builtin_open(local_path, mode, buffering=buffering) + return io.open(local_path, mode, buffering=buffering, **open_kwargs) def _open_binary_stream(uri, mode, transport_params): @@ -568,8 +516,8 @@ def _open_binary_stream(uri, mode, transport_params): :arg uri: The URI to open. May be a string, or something else. :arg str mode: The mode to open with. Must be rb, wb or ab. :arg transport_params: Keyword argumens for the transport layer. 
- :returns: A file object and the filename - :rtype: tuple + :returns: A named file object + :rtype: file-like object with a .name attribute """ if mode not in ('rb', 'rb+', 'wb', 'wb+', 'ab', 'ab+'): # @@ -585,63 +533,37 @@ def _open_binary_stream(uri, mode, transport_params): # if the value ends with COMPRESSED_EXT, we will note it in compression_wrapper() # if there is no such an attribute, we return "unknown" - this # effectively disables any compression - filename = getattr(uri, 'name', 'unknown') - return uri, filename + if not hasattr(uri, 'name'): + uri.name = getattr(uri, 'name', 'unknown') + return uri if not isinstance(uri, six.string_types): raise TypeError("don't know how to handle uri %r" % uri) - filename = uri.split('/')[-1] - parsed_uri = parse_uri(uri) + scheme = _sniff_scheme(uri) bad_scheme = NotImplementedError( "scheme %r is not supported, expected one of %r" % ( - parsed_uri.scheme, SUPPORTED_SCHEMES, + scheme, SUPPORTED_SCHEMES, ) ) - if parsed_uri.scheme not in SUPPORTED_SCHEMES: - raise bad_scheme - - if parsed_uri.scheme == FILE_SCHEME: - fobj = io.open(parsed_uri.uri_path, mode) - return fobj, filename - - if parsed_uri.scheme in smart_open_ssh.SUPPORTED_SCHEMES: - fobj = smart_open_ssh.open( - parsed_uri.uri_path, - mode, - host=parsed_uri.host, - user=parsed_uri.user, - port=parsed_uri.port, - password=parsed_uri.password, - transport_params=transport_params, - ) - return fobj, filename - if parsed_uri.scheme in smart_open_s3.SUPPORTED_SCHEMES: - parsed_uri, transport_params = smart_open_s3.consolidate_params(parsed_uri, transport_params) - kw = _check_kwargs(smart_open_s3.open, transport_params) - fobj = smart_open_s3.open(parsed_uri.bucket_id, parsed_uri.key_id, mode, **kw) - return fobj, filename - - if parsed_uri.scheme == smart_open_hdfs.HDFS_SCHEME: - _check_kwargs(smart_open_hdfs.open, transport_params) - return smart_open_hdfs.open(parsed_uri.uri_path, mode), filename + try: + transport = _TRANSPORT[scheme] + except KeyError: + 
raise bad_scheme - if parsed_uri.scheme == smart_open_webhdfs.WEBHDFS_SCHEME: - kw = _check_kwargs(smart_open_webhdfs.open, transport_params) - return smart_open_webhdfs.open(uri, mode, **kw), filename + try: + open_uri = getattr(transport, 'open_uri') + except AttributeError: + raise bad_scheme - if parsed_uri.scheme in smart_open_http.SUPPORTED_SCHEMES: - # - # The URI may contain a query string and fragments, which interfere - # with our compressed/uncompressed estimation, so we strip them. - # - filename = P.basename(urlparse.urlparse(uri).path) - kw = _check_kwargs(smart_open_http.open, transport_params) - return smart_open_http.open(uri, mode, **kw), filename + fobj = open_uri(uri, mode, transport_params) + if not hasattr(fobj, 'name'): + logger.critical('TODO') + fobj.name = 'unknown' - raise bad_scheme + return fobj def _encoding_wrapper(fileobj, mode, encoding=None, errors=None): diff --git a/smart_open/ssh.py b/smart_open/ssh.py index f065d788..07ee6103 100644 --- a/smart_open/ssh.py +++ b/smart_open/ssh.py @@ -28,7 +28,7 @@ from six.moves.urllib import parse as urlparse -import smart_open.uri +import smart_open.utils logger = logging.getLogger(__name__) @@ -50,7 +50,7 @@ def _unquote(text): def parse_uri(uri_as_string): split_uri = urlparse.urlsplit(uri_as_string) assert split_uri.scheme in SUPPORTED_SCHEMES - return smart_open.uri.Uri( + return dict( scheme=split_uri.scheme, uri_path=_unquote(split_uri.path), user=_unquote(split_uri.username), @@ -60,6 +60,14 @@ def parse_uri(uri_as_string): ) +def open_uri(uri, mode, transport_params): + smart_open.utils.check_kwargs(open, transport_params) + parsed_uri = parse_uri(uri) + uri_path = parsed_uri.pop('uri_path') + parsed_uri.pop('scheme') + return open(uri_path, mode, transport_params=transport_params, **parsed_uri) + + def _connect(hostname, username, port, password, transport_params): try: import paramiko @@ -127,4 +135,6 @@ def open(path, mode='r', host=None, user=None, password=None, 
port=DEFAULT_PORT, conn = _connect(host, user, port, password, transport_params) sftp_client = conn.get_transport().open_sftp_client() - return sftp_client.open(path, mode) + fobj = sftp_client.open(path, mode) + fobj.name = path + return fobj diff --git a/smart_open/uri.py b/smart_open/uri.py deleted file mode 100644 index 0f558146..00000000 --- a/smart_open/uri.py +++ /dev/null @@ -1,31 +0,0 @@ -import collections - - -Uri = collections.namedtuple( - 'Uri', - ( - 'scheme', - 'uri_path', - 'bucket_id', - 'key_id', - 'port', - 'host', - 'ordinary_calling_format', - 'access_id', - 'access_secret', - 'user', - 'password', - ) -) -"""Represents all the options that we parse from user input. - -Some of the above options only make sense for certain protocols, e.g. -bucket_id is only for S3. -""" -# -# Set the default values for all Uri fields to be None. This allows us to only -# specify the relevant fields when constructing a Uri. -# -# https://stackoverflow.com/questions/11351032/namedtuple-and-default-values-for-optional-keyword-arguments -# -Uri.__new__.__defaults__ = (None,) * len(Uri._fields) diff --git a/smart_open/utils.py b/smart_open/utils.py new file mode 100644 index 00000000..4fdae33a --- /dev/null +++ b/smart_open/utils.py @@ -0,0 +1,53 @@ +import inspect +import logging + +logger = logging.getLogger(__name__) + + +def inspect_kwargs(kallable): + # + # inspect.getargspec got deprecated in Py3.4, and calling it spews + # deprecation warnings that we'd prefer to avoid. Unfortunately, older + # versions of Python (<3.3) did not have inspect.signature, so we need to + # handle them the old-fashioned getargspec way. 
+ # + try: + signature = inspect.signature(kallable) + except AttributeError: + args, varargs, keywords, defaults = inspect.getargspec(kallable) + if not defaults: + return {} + supported_keywords = args[-len(defaults):] + return dict(zip(supported_keywords, defaults)) + else: + return { + name: param.default + for name, param in signature.parameters.items() + if param.default != inspect.Parameter.empty + } + + +def check_kwargs(kallable, kwargs): + """Check which keyword arguments the callable supports. + + Parameters + ---------- + kallable: callable + A function or method to test + kwargs: dict + The keyword arguments to check. If the callable doesn't support any + of these, a warning message will get printed. + + Returns + ------- + dict + A dictionary of argument names and values supported by the callable. + """ + supported_keywords = sorted(inspect_kwargs(kallable)) + unsupported_keywords = [k for k in sorted(kwargs) if k not in supported_keywords] + supported_kwargs = {k: v for (k, v) in kwargs.items() if k in supported_keywords} + + if unsupported_keywords: + logger.warning('ignoring unsupported keyword arguments: %r', unsupported_keywords) + + return supported_kwargs diff --git a/smart_open/webhdfs.py b/smart_open/webhdfs.py index 1fcfb992..eb66ca97 100644 --- a/smart_open/webhdfs.py +++ b/smart_open/webhdfs.py @@ -19,6 +19,8 @@ import six from six.moves.urllib import parse as urlparse +from smart_open import utils + if six.PY2: import httplib else: @@ -31,6 +33,15 @@ WEBHDFS_MIN_PART_SIZE = 50 * 1024**2 # minimum part size for HDFS multipart uploads +def parse_uri(uri_as_str): + return dict(scheme=WEBHDFS_SCHEME, uri=uri_as_str) + + +def open_uri(uri, mode, transport_params): + kwargs = utils.check_kwargs(open, transport_params) + return open(uri, mode, **kwargs) + + def open(http_uri, mode, min_part_size=WEBHDFS_MIN_PART_SIZE): """ Parameters @@ -45,12 +56,15 @@ def open(http_uri, mode, min_part_size=WEBHDFS_MIN_PART_SIZE): http_uri = 
_convert_to_http_uri(http_uri) if mode == 'rb': - return BufferedInputBase(http_uri) + fobj = BufferedInputBase(http_uri) elif mode == 'wb': - return BufferedOutputBase(http_uri, min_part_size=min_part_size) + fobj = BufferedOutputBase(http_uri, min_part_size=min_part_size) else: raise NotImplementedError("webhdfs support for mode %r not implemented" % mode) + fobj.name = http_uri.split('/')[-1] + return fobj + def _convert_to_http_uri(webhdfs_url): """ @@ -80,7 +94,7 @@ def _convert_to_http_uri(webhdfs_url): # For old unit tests. # def convert_to_http_uri(parsed_uri): - return _convert_to_http_uri(parsed_uri.uri_path) + return _convert_to_http_uri(parsed_uri.uri) class BufferedInputBase(io.BufferedIOBase): From af35d24e5d70f78f854479b588a726dd6641c751 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 12 Jan 2020 15:47:52 +0900 Subject: [PATCH 03/32] automate docstrings --- README.rst | 4 +- extending.md | 53 ++++++++++++-- smart_open/__init__.py | 12 +++- smart_open/file.py | 13 +++- smart_open/hdfs.py | 11 ++- smart_open/http.py | 4 +- smart_open/s3.py | 10 ++- smart_open/smart_open_lib.py | 132 +++++++++++++++++------------------ smart_open/ssh.py | 7 +- smart_open/webhdfs.py | 16 +++-- 10 files changed, 172 insertions(+), 90 deletions(-) diff --git a/README.rst b/README.rst index be6bd3f0..cc18e300 100644 --- a/README.rst +++ b/README.rst @@ -75,6 +75,8 @@ How? ... break '\n' +.. _doctools_after_examples: + Other examples of URLs that ``smart_open`` accepts:: s3://my_bucket/my_key @@ -93,8 +95,6 @@ Other examples of URLs that ``smart_open`` accepts:: [ssh|scp|sftp]://username@host/path/file [ssh|scp|sftp]://username:password@host/path/file -.. _doctools_after_examples: - Documentation ============= diff --git a/extending.md b/extending.md index 556f4e28..bbe75789 100644 --- a/extending.md +++ b/extending.md @@ -6,6 +6,8 @@ Currently, there are two main directions for extending existing `smart_open` fun 1. Add a new transport mechanism 2. 
Add a new compression format +The first is by far the more challenging, and also the more welcome. + ## New transport mechanisms Each transport mechanism lives in its own submodule. @@ -17,14 +19,18 @@ For example, currently we have: - ... and others So, to implement a new transport mechanism, you need to create a new module. -Your module should expose the following: +Your module must expose the following: ```python -XXX_SCHEMA = ... +SCHEMA = ... """The name of the mechanism, e.g. s3, ssh, etc. This is the part that goes before the `://` in a URL, e.g. `s3://`.""" +URI_EXAMPLES = ('xxx://foo/bar', 'zzz://baz/boz') +"""This will appear in the documentation of the the `parse_uri` function.""" + + def parse_uri(uri_as_str): """Parse the specified URI into a dict. @@ -34,7 +40,34 @@ def parse_uri(uri_as_str): def open_uri(uri_as_str, mode, transport_params): - """Return a file-like object pointing to the URI.""" + """Return a file-like object pointing to the URI. + + Parameters: + + uri_as_str: str + The URI to open + mode: str + Either "rb" or "wb". You don't need to implement text modes, + `smart_open` does that for you, outside of the transport layer. + transport_params: dict + Any additional parameters to pass to the `open` function (see below). + + """ + # + # Parse the URI using parse_uri + # Consolidate the parsed URI with transport_params, if needed + # Pass everything to the open function (see below). + # + ... + + +def open(..., mode, param1=None, param2=None, paramN=None): + """This function does the hard work. + + The keyword parameters are the transport_params from the `open_uri` + function. + + """ ... ``` @@ -43,7 +76,18 @@ You may define other functions and classes as necessary for your implementation. Once your module is working, register it in the `smart_open/smart_open_lib.py` file. The `_generate_transport()` generator builds a dictionary that maps schemes to the modules that implement functionality for them. 
-Include your new mechanism in that generator, and `smart_open` will be able to use it. + +Once you've registered your new transport module, the following will happen automagically: + +1. `smart_open` will be able to open any URI supported by your module +2. The docstring for the `smart_open.open` function will contain a section + detailing the parameters for your transport module. +3. The docstring for the `parse_uri` function will include the schemas and + examples supported by your module. + +You can confirm the documentation changes by running: + + python -c 'help("smart_open")' ## New compression mechanisms @@ -61,6 +105,7 @@ def _handle_xz(file_obj, mode): import lzma return lzma.LZMAFile(filename=file_obj, mode=mode, format=lzma.FORMAT_XZ) + register_compressor('.xz', _handle_xz) ``` diff --git a/smart_open/__init__.py b/smart_open/__init__.py index 3d7ed155..4f41fb88 100644 --- a/smart_open/__init__.py +++ b/smart_open/__init__.py @@ -16,6 +16,7 @@ The main functions are: * `open()`, which opens the given file for reading/writing +* `parse_uri()` * `s3_iter_bucket()`, which goes over all keys in an S3 bucket in parallel * `register_compressor()`, which registers callbacks for transparent compressor handling @@ -24,9 +25,16 @@ import logging from smart_open import version -from .smart_open_lib import open, smart_open, register_compressor +from .smart_open_lib import open, parse_uri, smart_open, register_compressor from .s3 import iter_bucket as s3_iter_bucket -__all__ = ['open', 'smart_open', 's3_iter_bucket', 'register_compressor'] + +__all__ = [ + 'open', + 'parse_uri', + 'register_compressor', + 's3_iter_bucket', + 'smart_open', +] __version__ = version.__version__ diff --git a/smart_open/file.py b/smart_open/file.py index d81df43b..d2ae66f5 100644 --- a/smart_open/file.py +++ b/smart_open/file.py @@ -9,12 +9,21 @@ import io import os.path -FILE_SCHEME = 'file' +SCHEME = 'file' + +URI_EXAMPLES = ( + './local/path/file', + '~/local/path/file', + 
'local/path/file', + './local/path/file.gz', + 'file:///home/user/file', + 'file:///home/user/file.bz2', +) def parse_uri(uri_as_string): local_path = extract_local_path(uri_as_string) - return dict(scheme=FILE_SCHEME, uri_path=local_path) + return dict(scheme=SCHEME, uri_path=local_path) def open_uri(uri_as_string, mode, transport_params): diff --git a/smart_open/hdfs.py b/smart_open/hdfs.py index f2adf681..d792455b 100644 --- a/smart_open/hdfs.py +++ b/smart_open/hdfs.py @@ -24,19 +24,24 @@ logger = logging.getLogger(__name__) -HDFS_SCHEME = 'hdfs' +SCHEME = 'hdfs' + +URI_EXAMPLES = ( + 'hdfs:///path/file', + 'hdfs://path/file', +) def parse_uri(uri_as_string): split_uri = urlparse.urlsplit(uri_as_string) - assert split_uri.scheme == HDFS_SCHEME + assert split_uri.scheme == SCHEME uri_path = split_uri.netloc + split_uri.path uri_path = "/" + uri_path.lstrip("/") if not uri_path: raise RuntimeError("invalid HDFS URI: %r" % uri_as_string) - return dict(scheme=HDFS_SCHEME, uri_path=uri_path) + return dict(scheme=SCHEME, uri_path=uri_path) def open_uri(uri, mode, transport_params): diff --git a/smart_open/http.py b/smart_open/http.py index de901e79..2625cb1f 100644 --- a/smart_open/http.py +++ b/smart_open/http.py @@ -18,7 +18,7 @@ import smart_open.utils DEFAULT_BUFFER_SIZE = 128 * 1024 -SUPPORTED_SCHEMES = ('http', 'https') +SCHEMES = ('http', 'https') logger = logging.getLogger(__name__) @@ -34,7 +34,7 @@ def parse_uri(uri_as_string): split_uri = urlparse.urlsplit(uri_as_string) - assert split_uri.scheme in SUPPORTED_SCHEMES + assert split_uri.scheme in SCHEMES uri_path = split_uri.netloc + split_uri.path uri_path = "/" + uri_path.lstrip("/") diff --git a/smart_open/s3.py b/smart_open/s3.py index 014820e8..215d5965 100644 --- a/smart_open/s3.py +++ b/smart_open/s3.py @@ -49,7 +49,7 @@ BINARY_NEWLINE = b'\n' -SUPPORTED_SCHEMES = ("s3", "s3n", 's3u', "s3a") +SCHEMES = ("s3", "s3n", 's3u', "s3a") DEFAULT_PORT = 443 DEFAULT_HOST = 's3.amazonaws.com' @@ -60,6 +60,12 @@ 
END = 2 WHENCE_CHOICES = [START, CURRENT, END] +URI_EXAMPLES = ( + 's3://my_bucket/my_key', + 's3://my_key:my_secret@my_bucket/my_key', + 's3://my_key:my_secret@my_server:my_port@my_bucket/my_key', +) + def _my_urlsplit(url): """This is a hack to prevent the regular urlsplit from splitting around question marks. @@ -97,7 +103,7 @@ def parse_uri(uri_as_string): # let boto3 take care of that for us. # split_uri = _my_urlsplit(uri_as_string) - assert split_uri.scheme in SUPPORTED_SCHEMES + assert split_uri.scheme in SCHEMES port = DEFAULT_PORT host = boto.config.get('s3', 'host', DEFAULT_HOST) diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index 8d462e25..d07ca061 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -12,7 +12,6 @@ * ``parse_uri()`` * ``open()`` - * ``register_compressor()`` """ @@ -78,14 +77,14 @@ def _generate_transport(): yield NO_SCHEME, so_file - yield so_file.FILE_SCHEME, so_file - yield so_hdfs.HDFS_SCHEME, so_hdfs - yield so_webhdfs.WEBHDFS_SCHEME, so_webhdfs - for scheme in so_s3.SUPPORTED_SCHEMES: + yield so_file.SCHEME, so_file + yield so_hdfs.SCHEME, so_hdfs + yield so_webhdfs.SCHEME, so_webhdfs + for scheme in so_s3.SCHEMES: yield scheme, so_s3 - for scheme in so_ssh.SUPPORTED_SCHEMES: + for scheme in so_ssh.SCHEMES: yield scheme, so_ssh - for scheme in so_http.SUPPORTED_SCHEMES: + for scheme in so_http.SCHEMES: yield scheme, so_http @@ -129,34 +128,13 @@ def parse_uri(uri_as_string): Supported URI schemes are: - * file - * hdfs - * http - * https - * s3 - * s3a - * s3n - * s3u - * webhdfs - - .s3, s3a and s3n are treated the same way. s3u is s3 but without SSL. +%(schemes)s + s3, s3a and s3n are treated the same way. s3u is s3 but without SSL. 
Valid URI examples:: - * s3://my_bucket/my_key - * s3://my_key:my_secret@my_bucket/my_key - * s3://my_key:my_secret@my_server:my_port@my_bucket/my_key - * hdfs:///path/file - * hdfs://path/file - * webhdfs://host:port/path/file - * ./local/path/file - * ~/local/path/file - * local/path/file - * ./local/path/file.gz - * file:///home/user/file - * file:///home/user/file.bz2 - * [ssh|scp|sftp]://username@host//path/file - * [ssh|scp|sftp]://username@host/path/file +%(uri_examples)s + """ scheme = _sniff_scheme(uri_as_string) @@ -203,13 +181,8 @@ def open( ): r"""Open the URI object, returning a file-like object. - The URI is usually a string in a variety of formats: - - 1. a URI for the local filesystem: `./lines.txt`, `/home/joe/lines.txt.gz`, - `file:///home/joe/lines.txt.bz2` - 2. a URI for HDFS: `hdfs:///some/path/lines.txt` - 3. a URI for Amazon's S3 (can also supply credentials inside the URI): - `s3://my_bucket/lines.txt`, `s3://my_aws_key_id:key_secret@my_bucket/lines.txt` + The URI is usually a string in a variety of formats. + For a full list of examples, see the :func:`parse_uri` function. The URI may also be one of: @@ -217,10 +190,9 @@ def open( - a stream (anything that implements io.IOBase-like functionality) This function supports transparent compression and decompression using the - following codec: + following codecs: - - ``.gz`` - - ``.bz2`` +%(codecs)s The function depends on the file extension to determine the appropriate codec. @@ -347,30 +319,6 @@ def open( return decoded -# -# The docstring can be None if -OO was passed to the interpreter. 
-# -open.__doc__ = None if open.__doc__ is None else open.__doc__ % { - 's3': doctools.to_docstring( - doctools.extract_kwargs(so_s3.open.__doc__), - lpad=u' ', - ), - 'http': doctools.to_docstring( - doctools.extract_kwargs(so_http.open.__doc__), - lpad=u' ', - ), - 'webhdfs': doctools.to_docstring( - doctools.extract_kwargs(so_webhdfs.open.__doc__), - lpad=u' ', - ), - 'ssh': doctools.to_docstring( - doctools.extract_kwargs(so_ssh.open.__doc__), - lpad=u' ', - ), - 'examples': doctools.extract_examples_from_readme_rst(), -} - - _MIGRATION_NOTES_URL = ( 'https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst' '#migrating-to-the-new-open-function' @@ -601,3 +549,55 @@ def _encoding_wrapper(fileobj, mode, encoding=None, errors=None): if mode[0] in ('w', 'a') or mode.endswith('+'): fileobj = codecs.getwriter(encoding)(fileobj, **kw) return fileobj + + +def _tweak_docstrings(): + seen = set() + substrings = {} + schemes = io.StringIO() + seen_examples = set() + uri_examples = io.StringIO() + + for scheme, transport in sorted(_TRANSPORT.items()): + if scheme == NO_SCHEME: + continue + + schemes.write(' * %s\n' % scheme) + + try: + fn = transport.open + except AttributeError: + substrings[scheme] = '' + else: + kwargs = doctools.extract_kwargs(fn.__doc__) + substrings[scheme] = doctools.to_docstring(kwargs, lpad=u' ') + + try: + examples = transport.URI_EXAMPLES + except AttributeError: + continue + else: + for e in examples: + if e not in seen_examples: + uri_examples.write(' * %s\n' % e) + seen_examples.add(e) + + substrings['codecs'] = '\n'.join( + [' * %s' % e for e in compression.get_supported_extensions()] + ) + substrings['examples'] = doctools.extract_examples_from_readme_rst() + + # + # The docstring can be None if -OO was passed to the interpreter. 
+ # + if open.__doc__: + open.__doc__ = open.__doc__ % substrings + + if parse_uri.__doc__: + parse_uri.__doc__ = parse_uri.__doc__ % dict( + schemes=schemes.getvalue(), + uri_examples=uri_examples.getvalue(), + ) + + +_tweak_docstrings() diff --git a/smart_open/ssh.py b/smart_open/ssh.py index 07ee6103..2c03de94 100644 --- a/smart_open/ssh.py +++ b/smart_open/ssh.py @@ -37,11 +37,16 @@ # _SSH = {} -SUPPORTED_SCHEMES = ("ssh", "scp", "sftp") +SCHEMES = ("ssh", "scp", "sftp") """Supported URL schemes.""" DEFAULT_PORT = 22 +URI_EXAMPLES = ( + '[ssh|scp|sftp]://username@host//path/file', + '[ssh|scp|sftp]://username@host/path/file', +) + def _unquote(text): return text and urlparse.unquote(text) diff --git a/smart_open/webhdfs.py b/smart_open/webhdfs.py index eb66ca97..ac9d3f8e 100644 --- a/smart_open/webhdfs.py +++ b/smart_open/webhdfs.py @@ -28,13 +28,17 @@ logger = logging.getLogger(__name__) -WEBHDFS_SCHEME = 'webhdfs' +SCHEME = 'webhdfs' -WEBHDFS_MIN_PART_SIZE = 50 * 1024**2 # minimum part size for HDFS multipart uploads +URI_EXAMPLES = ( + 'webhdfs://host:port/path/file', +) + +MIN_PART_SIZE = 50 * 1024**2 # minimum part size for HDFS multipart uploads def parse_uri(uri_as_str): - return dict(scheme=WEBHDFS_SCHEME, uri=uri_as_str) + return dict(scheme=SCHEME, uri=uri_as_str) def open_uri(uri, mode, transport_params): @@ -42,7 +46,7 @@ def open_uri(uri, mode, transport_params): return open(uri, mode, **kwargs) -def open(http_uri, mode, min_part_size=WEBHDFS_MIN_PART_SIZE): +def open(http_uri, mode, min_part_size=MIN_PART_SIZE): """ Parameters ---------- @@ -52,7 +56,7 @@ def open(http_uri, mode, min_part_size=WEBHDFS_MIN_PART_SIZE): For writing only. 
""" - if http_uri.startswith(WEBHDFS_SCHEME): + if http_uri.startswith(SCHEME): http_uri = _convert_to_http_uri(http_uri) if mode == 'rb': @@ -167,7 +171,7 @@ def readline(self): class BufferedOutputBase(io.BufferedIOBase): - def __init__(self, uri, min_part_size=WEBHDFS_MIN_PART_SIZE): + def __init__(self, uri, min_part_size=MIN_PART_SIZE): """ Parameters ---------- From 57c459fdaa7d867029416a2cadeae8551f0bfe26 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 12 Jan 2020 16:17:08 +0900 Subject: [PATCH 04/32] link to extending.md from README.rst --- README.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.rst b/README.rst index cc18e300..deef86ec 100644 --- a/README.rst +++ b/README.rst @@ -333,6 +333,11 @@ If your file object doesn't have one, set the ``.name`` attribute to an appropri Furthermore, that value has to end with a **known** file extension (see the ``register_compressor`` function). Otherwise, the transparent decompression will not occur. +Extending ``smart_open`` +======================== + +See `this document `__. 
+ Comments, bug reports ===================== From 3961dbb1b8eed6e9cd40facf1a2b2f3fbe577ca2 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 12 Jan 2020 16:17:14 +0900 Subject: [PATCH 05/32] fixup --- smart_open/smart_open_lib.py | 2 +- smart_open/ssh.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index d07ca061..7ed632d9 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -423,7 +423,7 @@ def _shortcut_open( return None scheme = _sniff_scheme(uri) - if scheme not in (NO_SCHEME, so_file.FILE_SCHEME): + if scheme not in (NO_SCHEME, so_file.SCHEME): return None local_path = so_file.extract_local_path(uri) diff --git a/smart_open/ssh.py b/smart_open/ssh.py index 2c03de94..cdb50d9d 100644 --- a/smart_open/ssh.py +++ b/smart_open/ssh.py @@ -54,7 +54,7 @@ def _unquote(text): def parse_uri(uri_as_string): split_uri = urlparse.urlsplit(uri_as_string) - assert split_uri.scheme in SUPPORTED_SCHEMES + assert split_uri.scheme in SCHEMES return dict( scheme=split_uri.scheme, uri_path=_unquote(split_uri.path), From 463b060731f41b4380575b558cfa5f1b112220d7 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 12 Jan 2020 16:18:20 +0900 Subject: [PATCH 06/32] improve my_urlsplit function name --- smart_open/s3.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/smart_open/s3.py b/smart_open/s3.py index 215d5965..a5f99df8 100644 --- a/smart_open/s3.py +++ b/smart_open/s3.py @@ -67,7 +67,7 @@ ) -def _my_urlsplit(url): +def _safe_urlsplit(url): """This is a hack to prevent the regular urlsplit from splitting around question marks. A question mark (?) in a URL typically indicates the start of a @@ -102,7 +102,7 @@ def parse_uri(uri_as_string): # We use the above as a guide only, and do not perform any validation. We # let boto3 take care of that for us. 
# - split_uri = _my_urlsplit(uri_as_string) + split_uri = _safe_urlsplit(uri_as_string) assert split_uri.scheme in SCHEMES port = DEFAULT_PORT From 4f287dfd771d69ae4f672992218650b119c362cf Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 12 Jan 2020 16:19:38 +0900 Subject: [PATCH 07/32] improve docstring --- smart_open/compression.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/smart_open/compression.py b/smart_open/compression.py index c71fb791..ac76c224 100644 --- a/smart_open/compression.py +++ b/smart_open/compression.py @@ -31,9 +31,11 @@ def register_compressor(ext, callback): Parameters ---------- ext: str - The extension. + The extension. Must include the leading period, e.g. ``.gz``. callback: callable The callback. It must accept two position arguments, file_obj and mode. + This function will be called when ``smart_open`` is opening a file with + the specified extension. Examples -------- From 3dcb71a4271c78b370ee15a76d3c2dfa8b51c21f Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 12 Jan 2020 16:33:31 +0900 Subject: [PATCH 08/32] remove unused variable --- smart_open/smart_open_lib.py | 1 - 1 file changed, 1 deletion(-) diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index 7ed632d9..008ce3eb 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -552,7 +552,6 @@ def _encoding_wrapper(fileobj, mode, encoding=None, errors=None): def _tweak_docstrings(): - seen = set() substrings = {} schemes = io.StringIO() seen_examples = set() From 4ee4490d2ddf3986774cd2c7c3a0d614367ea376 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 12 Jan 2020 17:19:38 +0900 Subject: [PATCH 09/32] fixup --- smart_open/compression.py | 14 +++++++++++--- smart_open/smart_open_lib.py | 2 +- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/smart_open/compression.py b/smart_open/compression.py index ac76c224..e57b0269 100644 --- a/smart_open/compression.py +++ 
b/smart_open/compression.py @@ -70,18 +70,26 @@ def _handle_gzip(file_obj, mode): return gzip.GzipFile(fileobj=file_obj, mode=mode) -def compression_wrapper(file_obj, filename, mode): +def compression_wrapper(file_obj, mode): """ This function will wrap the file_obj with an appropriate [de]compression mechanism based on the extension of the filename. file_obj must either be a filehandle object, or a class which behaves - like one. + like one. It must have a .name attribute. If the filename extension isn't recognized, will simply return the original file_obj. """ - _, ext = os.path.splitext(filename) + + try: + _, ext = os.path.splitext(file_obj.name) + except (AttributeError, TypeError): + logger.warning( + 'unable to transparently decompress %r because it ' + 'seems to lack a string-like .name', file_obj + ) + return file_obj if _need_to_buffer(file_obj, mode, ext): warnings.warn('streaming gzip support unavailable, see %s' % _ISSUE_189_URL) diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index 008ce3eb..2ff540f2 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -309,7 +309,7 @@ def open( if ignore_ext: decompressed = binary else: - decompressed = compression.compression_wrapper(binary, binary.name, mode) + decompressed = compression.compression_wrapper(binary, mode) if 'b' not in mode or explicit_encoding is not None: decoded = _encoding_wrapper(decompressed, mode, encoding=encoding, errors=errors) From 2c8a4f26d515cab6decd42405d9b34cb12b9996e Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 12 Jan 2020 17:28:51 +0900 Subject: [PATCH 10/32] disable docstring tweaking on Py2 --- smart_open/smart_open_lib.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index 2ff540f2..06d2d66c 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -599,4 +599,9 @@ def _tweak_docstrings(): ) 
-_tweak_docstrings()
+#
+# The code below doesn't work on Py2. We _could_ make it work, but given that
+# it's 2020 and Py2 is on its way out, I'm just going to disable it.
+#
+if not six.PY2:
+    _tweak_docstrings()

From f4896897a57078907cca42c790faf1d39d519129 Mon Sep 17 00:00:00 2001
From: Michael Penkov
Date: Sun, 12 Jan 2020 17:40:32 +0900
Subject: [PATCH 11/32] more Py27 goodness

---
 smart_open/utils.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/smart_open/utils.py b/smart_open/utils.py
index 4fdae33a..988829a5 100644
--- a/smart_open/utils.py
+++ b/smart_open/utils.py
@@ -14,7 +14,14 @@ def inspect_kwargs(kallable):
     try:
         signature = inspect.signature(kallable)
     except AttributeError:
-        args, varargs, keywords, defaults = inspect.getargspec(kallable)
+        try:
+            args, varargs, keywords, defaults = inspect.getargspec(kallable)
+        except TypeError:
+            #
+            # Happens under Py2.7 with mocking.
+            #
+            return {}
+
     if not defaults:
         return {}
     supported_keywords = args[-len(defaults):]

From b22e3b02e8424e1e0c68c3f2130cef54a9d6aaa6 Mon Sep 17 00:00:00 2001
From: Michael Penkov
Date: Sun, 12 Jan 2020 23:22:42 +0900
Subject: [PATCH 12/32] add section to extending.md

---
 extending.md | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/extending.md b/extending.md
index bbe75789..7cbb152b 100644
--- a/extending.md
+++ b/extending.md
@@ -89,6 +89,22 @@ You can confirm the documentation changes by running:
 
     python -c 'help("smart_open")'
 
+### What's the difference between the `open_uri` and `open` functions?
+
+There are several key differences between the two.
+
+First, the parameters to `open_uri` are the same for _all transports_.
+On the other hand, the parameters to the `open` function can differ from transport to transport.
+
+Second, the responsibilities of the two functions are also different.
+The `open` function opens the remote object.
+The `open_uri` function deals with parsing transport-specific details out of the URI, and then delegates to `open`. + +The `open` function contains documentation for transport parameters. +This documentation gets parsed by the `doctools` module and appears in various docstrings. + +Some of these differences are by design; others as a consequence of evolution. + ## New compression mechanisms The compression layer is self-contained in the `smart_open.compression` submodule. From 1cc60eaa6ddd3083f86e31402038421159bde655 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Thu, 30 Jan 2020 15:13:33 +0900 Subject: [PATCH 13/32] improving transport submodule registration --- smart_open/{file.py => local_file.py} | 0 smart_open/smart_open_lib.py | 46 ++++++++++++++++----------- 2 files changed, 27 insertions(+), 19 deletions(-) rename smart_open/{file.py => local_file.py} (100%) diff --git a/smart_open/file.py b/smart_open/local_file.py similarity index 100% rename from smart_open/file.py rename to smart_open/local_file.py diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index 452ff22f..a43c43c2 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -34,7 +34,7 @@ # This module defines a function called smart_open so we cannot use # smart_open.submodule to reference to the submodules. 
# -import smart_open.file as so_file +import smart_open.local_file as so_file import smart_open.s3 as so_s3 import smart_open.hdfs as so_hdfs import smart_open.webhdfs as so_webhdfs @@ -75,25 +75,33 @@ 'a': 'ab', 'a+': 'ab+', 'at': 'ab', 'at+': 'ab+', } +_TRANSPORT = {NO_SCHEME: so_file} -def _generate_transport(): - yield NO_SCHEME, so_file - yield so_file.SCHEME, so_file - yield so_hdfs.SCHEME, so_hdfs - yield so_webhdfs.SCHEME, so_webhdfs - yield so_gcs.SCHEME, so_gcs - for scheme in so_s3.SCHEMES: - yield scheme, so_s3 - for scheme in so_ssh.SCHEMES: - yield scheme, so_ssh - for scheme in so_http.SCHEMES: - yield scheme, so_http - - -_TRANSPORT = dict(_generate_transport()) -for schema, transport in _TRANSPORT.items(): - assert hasattr(transport, 'open_uri'), '%r is missing open_uri' % schema - assert hasattr(transport, 'parse_uri'), '%r is missing parse_uri' % schema + +def _register_transport(submodule): + global _TRANSPORT + if hasattr(submodule, 'SCHEME'): + schemes = [submodule.SCHEME] + elif hasattr(submodule, 'SCHEMES'): + schemes = submodule.SCHEMES + else: + raise ValueError('%r does not have a .SCHEME or .SCHEMES attribute' % submodule) + + assert hasattr(submodule, 'open_uri'), '%r is missing open_uri' % submodule + assert hasattr(submodule, 'parse_uri'), '%r is missing parse_uri' % submodule + + for scheme in schemes: + assert scheme not in _TRANSPORT + _TRANSPORT[scheme] = submodule + + +_register_transport(so_file) +_register_transport(so_gcs) +_register_transport(so_hdfs) +_register_transport(so_http) +_register_transport(so_s3) +_register_transport(so_ssh) +_register_transport(so_webhdfs) SUPPORTED_SCHEMES = tuple(sorted(_TRANSPORT.keys())) """The transport schemes that ``smart_open`` supports.""" From 4d3b1a7882eeff2cbaa934a8c5c1e246f846eb44 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Thu, 30 Jan 2020 15:13:51 +0900 Subject: [PATCH 14/32] integrating gcs into new design --- smart_open/gcs.py | 15 ++++++++++++++ 
smart_open/tests/test_gcs.py | 39 ++++++++++++++++++++++++------------ 2 files changed, 41 insertions(+), 13 deletions(-) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 7d08421b..a5ca37a1 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -15,6 +15,7 @@ import google.cloud.storage import google.auth.transport.requests as google_requests import six +from six.moves.urllib import parse as urlparse import smart_open.bytebuffer import smart_open.s3 @@ -91,6 +92,20 @@ def __init__(self, message, status_code, text): self.text = text +def parse_uri(uri_as_string): + sr = urlparse.urlsplit(uri_as_string) + assert sr.scheme == SCHEME + bucket_id = sr.netloc + blob_id = sr.path.lstrip('/') + return dict(scheme=SCHEME, bucket_id=bucket_id, blob_id=blob_id) + + +def open_uri(uri, mode, transport_params): + parsed_uri = parse_uri(uri) + kwargs = smart_open.utils.check_kwargs(open, transport_params) + return open(parsed_uri['bucket_id'], parsed_uri['blob_id'], mode, **kwargs) + + def open( bucket_id, blob_id, diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index f021dd78..be0e1b06 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -434,18 +434,30 @@ def mock_gcs(class_or_func): def mock_gcs_func(func): """Mock the function and provide additional required arguments.""" + assert callable(func), '%r is not a callable function' % func + def inner(*args, **kwargs): - with mock.patch('google.cloud.storage.Client', return_value=storage_client), \ - mock.patch( - 'smart_open.gcs.google_requests.AuthorizedSession', - return_value=FakeAuthorizedSession(storage_client._credentials), - ): - assert callable(func), 'you didn\'t provide a function!' - try: # is it a method that needs a self arg? - self_arg = inspect.signature(func).self - func(self_arg, *args, **kwargs) - except AttributeError: - func(*args, **kwargs) + # + # Is it a function or a method? The latter requires a self parameter. 
+ # + signature = inspect.signature(func) + + fake_session = FakeAuthorizedSession(storage_client._credentials) + patched_client = mock.patch( + 'google.cloud.storage.Client', + return_value=storage_client, + ) + patched_session = mock.patch( + 'smart_open.gcs.google_requests.AuthorizedSession', + return_value=fake_session, + ) + + with patched_client, patched_session: + if not hasattr(signature, 'self'): + return func(*args, **kwargs) + else: + return func(signature.self, *args, **kwargs) + return inner @@ -666,11 +678,12 @@ def test_write_01(self): with smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) as fout: fout.write(test_string) - output = list(smart_open.open("gs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME), "rb")) + with smart_open.open("gs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME), "rb") as fin: + output = list(fin) self.assertEqual(output, [test_string]) - def test_write_01a(self): + def test_incorrect_input(self): """Does gcs write fail on incorrect input?""" try: with smart_open.gcs.BufferedOutputBase(BUCKET_NAME, WRITE_BLOB_NAME) as fin: From 64f43f08c10b678b03cb66027e3204e75632f103 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Thu, 30 Jan 2020 15:16:25 +0900 Subject: [PATCH 15/32] disable moto server by default --- .travis.yml | 7 +++---- smart_open/tests/test_s3.py | 7 +++++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index 116fdabc..c6140c44 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,23 +11,22 @@ matrix: - python: '2.7' env: - SO_DISABLE_MOCKS: "1" - - SO_DISABLE_MOTO_SERVER: "1" - SO_S3_URL: "s3://smart-open-py27-benchmark" - SO_S3_RESULT_URL: "s3://smart-open-py27-benchmark-results" - python: '3.5' + env: + - SO_ENABLE_MOTO_SERVER: "1" - python: '3.6' env: - SO_DISABLE_MOCKS: "1" - - SO_DISABLE_MOTO_SERVER: "1" - SO_S3_URL: "s3://smart-open-py36-benchmark" - SO_S3_RESULT_URL: "s3://smart-open-py36-benchmark-results" - python: '3.7' env: - SO_DISABLE_MOCKS: "1" - - 
SO_DISABLE_MOTO_SERVER: "1" - SO_S3_URL: "s3://smart-open-py37-benchmark" - SO_S3_RESULT_URL: "s3://smart-open-py37-benchmark-results" - BOTO_CONFIG: "/dev/null" @@ -49,7 +48,7 @@ script: unset SO_S3_URL; unset SO_S3_RESULT_URL; fi - - if [[ ${SO_DISABLE_MOTO_SERVER} -ne 1 ]]; then + - if [[ ${SO_ENABLE_MOTO_SERVER} = "1" ]]; then sh -c "moto_server -p5000 2> /dev/null &"; fi - flake8 --max-line-length=110 diff --git a/smart_open/tests/test_s3.py b/smart_open/tests/test_s3.py index 4d793e36..89b6c7c5 100644 --- a/smart_open/tests/test_s3.py +++ b/smart_open/tests/test_s3.py @@ -34,7 +34,7 @@ KEY_NAME = 'test-key' WRITE_KEY_NAME = 'test-write-key' DISABLE_MOCKS = os.environ.get('SO_DISABLE_MOCKS') == "1" -DISABLE_MOTO_SERVER = os.environ.get("SO_DISABLE_MOTO_SERVER") == "1" +ENABLE_MOTO_SERVER = os.environ.get("SO_ENABLE_MOTO_SERVER") == "1" logger = logging.getLogger(__name__) @@ -105,7 +105,10 @@ def ignore_resource_warnings(): warnings.filterwarnings("ignore", category=ResourceWarning, message="unclosed.*") # noqa -@unittest.skipIf(DISABLE_MOTO_SERVER, 'The test case needs a Moto server running on the local 5000 port.') +@unittest.skipUnless( + ENABLE_MOTO_SERVER, + 'The test case needs a Moto server running on the local 5000 port.' +) class SeekableRawReaderTest(unittest.TestCase): def setUp(self): From 6110269e726934f23490b08f21786dc91bca4c82 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 27 Mar 2020 13:37:29 +0900 Subject: [PATCH 16/32] import submodules via importlib for flexibility --- smart_open/smart_open_lib.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index a43c43c2..d9cbb267 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -35,12 +35,6 @@ # smart_open.submodule to reference to the submodules. 
# import smart_open.local_file as so_file -import smart_open.s3 as so_s3 -import smart_open.hdfs as so_hdfs -import smart_open.webhdfs as so_webhdfs -import smart_open.http as so_http -import smart_open.ssh as so_ssh -import smart_open.gcs as so_gcs from smart_open import compression from smart_open import doctools @@ -80,6 +74,13 @@ def _register_transport(submodule): global _TRANSPORT + if isinstance(submodule, str): + try: + submodule = importlib.import_module(submodule) + except ImportError: + _LOGGER.warning('unable to import %r, disabling that module', submodule) + return + if hasattr(submodule, 'SCHEME'): schemes = [submodule.SCHEME] elif hasattr(submodule, 'SCHEMES'): @@ -96,12 +97,12 @@ def _register_transport(submodule): _register_transport(so_file) -_register_transport(so_gcs) -_register_transport(so_hdfs) -_register_transport(so_http) -_register_transport(so_s3) -_register_transport(so_ssh) -_register_transport(so_webhdfs) +_register_transport('smart_open.gcs') +_register_transport('smart_open.hdfs') +_register_transport('smart_open.http') +_register_transport('smart_open.s3') +_register_transport('smart_open.ssh') +_register_transport('smart_open.webhdfs') SUPPORTED_SCHEMES = tuple(sorted(_TRANSPORT.keys())) """The transport schemes that ``smart_open`` supports.""" From abf4fef3393d4145e654002bdd9c78b178fbc4a4 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 27 Mar 2020 14:07:50 +0900 Subject: [PATCH 17/32] move tweak function to doctools --- smart_open/doctools.py | 61 +++++++++++++++++++++++++++++++++++ smart_open/smart_open_lib.py | 62 +++--------------------------------- smart_open/utils.py | 8 +++++ 3 files changed, 73 insertions(+), 58 deletions(-) diff --git a/smart_open/doctools.py b/smart_open/doctools.py index dd5d7490..b31e5fea 100644 --- a/smart_open/doctools.py +++ b/smart_open/doctools.py @@ -16,6 +16,12 @@ import os.path import re +import six + +from . 
import compression + +_NO_SCHEME = '' + def extract_kwargs(docstring): """Extract keyword argument documentation from a function's docstring. @@ -156,3 +162,58 @@ def extract_examples_from_readme_rst(indent=' '): return ''.join([indent + re.sub('^ ', '', l) for l in lines]) except Exception: return indent + 'See README.rst' + + +def tweak_docstrings(open_function, parse_uri_function, transport): + # + # The code below doesn't work on Py2. We _could_ make it work, but given + # that it's 2020 and Py2 is on it's way out, I'm just going to disable it. + # + if six.PY2: + return + + substrings = {} + schemes = io.StringIO() + seen_examples = set() + uri_examples = io.StringIO() + + for scheme, transport in sorted(transport.items()): + if scheme == _NO_SCHEME: + continue + + schemes.write(' * %s\n' % scheme) + + try: + fn = transport.open + except AttributeError: + substrings[scheme] = '' + else: + kwargs = extract_kwargs(fn.__doc__) + substrings[scheme] = to_docstring(kwargs, lpad=u' ') + + try: + examples = transport.URI_EXAMPLES + except AttributeError: + continue + else: + for e in examples: + if e not in seen_examples: + uri_examples.write(' * %s\n' % e) + seen_examples.add(e) + + substrings['codecs'] = '\n'.join( + [' * %s' % e for e in compression.get_supported_extensions()] + ) + substrings['examples'] = extract_examples_from_readme_rst() + + # + # The docstring can be None if -OO was passed to the interpreter. 
+ # + if open_function.__doc__: + open_function.__doc__ = open_function.__doc__ % substrings + + if parse_uri_function.__doc__: + parse_uri_function.__doc__ = parse_uri_function.__doc__ % dict( + schemes=schemes.getvalue(), + uri_examples=uri_examples.getvalue(), + ) diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index 67321c44..a79c0c34 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -105,7 +105,7 @@ def _register_transport(submodule): _register_transport('smart_open.webhdfs') SUPPORTED_SCHEMES = tuple(sorted(_TRANSPORT.keys())) -"""The transport schemes that ``smart_open`` supports.""" +"""The transport schemes that the local installation of ``smart_open`` supports.""" def _sniff_scheme(uri_as_string): @@ -137,7 +137,6 @@ def parse_uri(uri_as_string): Notes ----- -<<<<<<< HEAD Supported URI schemes are: %(schemes)s @@ -563,62 +562,6 @@ def _encoding_wrapper(fileobj, mode, encoding=None, errors=None): return fileobj -def _tweak_docstrings(): - substrings = {} - schemes = io.StringIO() - seen_examples = set() - uri_examples = io.StringIO() - - for scheme, transport in sorted(_TRANSPORT.items()): - if scheme == NO_SCHEME: - continue - - schemes.write(' * %s\n' % scheme) - - try: - fn = transport.open - except AttributeError: - substrings[scheme] = '' - else: - kwargs = doctools.extract_kwargs(fn.__doc__) - substrings[scheme] = doctools.to_docstring(kwargs, lpad=u' ') - - try: - examples = transport.URI_EXAMPLES - except AttributeError: - continue - else: - for e in examples: - if e not in seen_examples: - uri_examples.write(' * %s\n' % e) - seen_examples.add(e) - - substrings['codecs'] = '\n'.join( - [' * %s' % e for e in compression.get_supported_extensions()] - ) - substrings['examples'] = doctools.extract_examples_from_readme_rst() - - # - # The docstring can be None if -OO was passed to the interpreter. 
- # - if open.__doc__: - open.__doc__ = open.__doc__ % substrings - - if parse_uri.__doc__: - parse_uri.__doc__ = parse_uri.__doc__ % dict( - schemes=schemes.getvalue(), - uri_examples=uri_examples.getvalue(), - ) - - -# -# The code below doesn't work on Py2. We _could_ make it work, but given that -# it's 2020 and Py2 is on it's way out, I'm just going to disable it. -# -if not six.PY2: - _tweak_docstrings() - - class patch_pathlib(object): """Replace `Path.open` with `smart_open.open`""" @@ -641,3 +584,6 @@ def _patch_pathlib(func): old_impl = pathlib.Path.open pathlib.Path.open = func return old_impl + + +doctools.tweak_docstrings(open, parse_uri, _TRANSPORT) diff --git a/smart_open/utils.py b/smart_open/utils.py index 988829a5..0eb9a922 100644 --- a/smart_open/utils.py +++ b/smart_open/utils.py @@ -1,3 +1,11 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2020 Radim Rehurek +# +# This code is distributed under the terms and conditions +# from the MIT License (MIT). +# +"""Helper functions for documentation, etc.""" import inspect import logging From 110a55747de988ab5a22f8273edb9feb69ae98d0 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 27 Mar 2020 15:50:19 +0900 Subject: [PATCH 18/32] split out separate transport.py submodule --- extending.md | 6 ++- smart_open/doctools.py | 19 ++++---- smart_open/local_file.py | 3 ++ smart_open/smart_open_lib.py | 79 +++------------------------------ smart_open/transport.py | 86 ++++++++++++++++++++++++++++++++++++ 5 files changed, 111 insertions(+), 82 deletions(-) create mode 100644 smart_open/transport.py diff --git a/extending.md b/extending.md index 7cbb152b..94727117 100644 --- a/extending.md +++ b/extending.md @@ -74,8 +74,8 @@ def open(..., mode, param1=None, param2=None, paramN=None): Have a look at the existing mechanisms to see how they work. You may define other functions and classes as necessary for your implementation. -Once your module is working, register it in the `smart_open/smart_open_lib.py` file. 
-The `_generate_transport()` generator builds a dictionary that maps schemes to the modules that implement functionality for them. +Once your module is working, register it in the `smart_open/transport.py` file. +The `generate_transport()` generator updates a dictionary that maps schemes to the modules that implement functionality for them. Once you've registered your new transport module, the following will happen automagically: @@ -89,6 +89,8 @@ You can confirm the documentation changes by running: python -c 'help("smart_open")' +and verify that documentation for your new submodule shows up. + ### What's the difference between the `open_uri` and `open` functions? There are several key differences between the two. diff --git a/smart_open/doctools.py b/smart_open/doctools.py index b31e5fea..fdcbed6c 100644 --- a/smart_open/doctools.py +++ b/smart_open/doctools.py @@ -19,8 +19,7 @@ import six from . import compression - -_NO_SCHEME = '' +from . import transport def extract_kwargs(docstring): @@ -79,8 +78,12 @@ def extract_kwargs(docstring): # 1. Find the underlined 'Parameters' section # 2. Once there, continue parsing parameters until we hit an empty line # - while lines[0] != 'Parameters': + while lines and lines[0] != 'Parameters': lines.pop(0) + + if not lines: + return [] + lines.pop(0) lines.pop(0) @@ -164,7 +167,7 @@ def extract_examples_from_readme_rst(indent=' '): return indent + 'See README.rst' -def tweak_docstrings(open_function, parse_uri_function, transport): +def tweak_docstrings(open_function, parse_uri_function): # # The code below doesn't work on Py2. We _could_ make it work, but given # that it's 2020 and Py2 is on it's way out, I'm just going to disable it. 
@@ -177,14 +180,14 @@ def tweak_docstrings(open_function, parse_uri_function, transport): seen_examples = set() uri_examples = io.StringIO() - for scheme, transport in sorted(transport.items()): - if scheme == _NO_SCHEME: + for scheme, submodule in sorted(transport._REGISTRY.items()): + if scheme == transport.NO_SCHEME: continue schemes.write(' * %s\n' % scheme) try: - fn = transport.open + fn = submodule.open except AttributeError: substrings[scheme] = '' else: @@ -192,7 +195,7 @@ def tweak_docstrings(open_function, parse_uri_function, transport): substrings[scheme] = to_docstring(kwargs, lpad=u' ') try: - examples = transport.URI_EXAMPLES + examples = submodule.URI_EXAMPLES except AttributeError: continue else: diff --git a/smart_open/local_file.py b/smart_open/local_file.py index d2ae66f5..e5f5c5aa 100644 --- a/smart_open/local_file.py +++ b/smart_open/local_file.py @@ -21,6 +21,9 @@ ) +open = io.open + + def parse_uri(uri_as_string): local_path = extract_local_path(uri_as_string) return dict(scheme=SCHEME, uri_path=local_path) diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index a79c0c34..0654b745 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -38,6 +38,7 @@ from smart_open import compression from smart_open import doctools +from smart_open import transport from smart_open import utils # @@ -61,52 +62,12 @@ SYSTEM_ENCODING = sys.getdefaultencoding() -NO_SCHEME = '' - _TO_BINARY_LUT = { 'r': 'rb', 'r+': 'rb+', 'rt': 'rb', 'rt+': 'rb+', 'w': 'wb', 'w+': 'wb+', 'wt': 'wb', "wt+": 'wb+', 'a': 'ab', 'a+': 'ab+', 'at': 'ab', 'at+': 'ab+', } -_TRANSPORT = {NO_SCHEME: so_file} - - -def _register_transport(submodule): - global _TRANSPORT - if isinstance(submodule, str): - try: - submodule = importlib.import_module(submodule) - except ImportError: - _LOGGER.warning('unable to import %r, disabling that module', submodule) - return - - if hasattr(submodule, 'SCHEME'): - schemes = [submodule.SCHEME] - elif 
hasattr(submodule, 'SCHEMES'): - schemes = submodule.SCHEMES - else: - raise ValueError('%r does not have a .SCHEME or .SCHEMES attribute' % submodule) - - assert hasattr(submodule, 'open_uri'), '%r is missing open_uri' % submodule - assert hasattr(submodule, 'parse_uri'), '%r is missing parse_uri' % submodule - - for scheme in schemes: - assert scheme not in _TRANSPORT - _TRANSPORT[scheme] = submodule - - -_register_transport(so_file) -_register_transport('smart_open.gcs') -_register_transport('smart_open.hdfs') -_register_transport('smart_open.http') -_register_transport('smart_open.s3') -_register_transport('smart_open.ssh') -_register_transport('smart_open.webhdfs') - -SUPPORTED_SCHEMES = tuple(sorted(_TRANSPORT.keys())) -"""The transport schemes that the local installation of ``smart_open`` supports.""" - def _sniff_scheme(uri_as_string): """Returns the scheme of the URL only, as a string.""" @@ -149,18 +110,8 @@ def parse_uri(uri_as_string): """ scheme = _sniff_scheme(uri_as_string) - - try: - transport = _TRANSPORT[scheme] - except KeyError: - raise NotImplementedError("unknown URI scheme %r in %r" % (scheme, uri_as_string)) - - try: - parse_uri = getattr(transport, 'parse_uri') - except AttributeError: - raise NotImplementedError('%r transport does not implement parse_uri', scheme) - - as_dict = parse_uri(uri_as_string) + submodule = transport.get_transport(scheme) + as_dict = submodule.parse_uri(uri_as_string) # # The conversion to a namedtuple is just to keep the old tests happy while @@ -434,7 +385,7 @@ def _shortcut_open( return None scheme = _sniff_scheme(uri) - if scheme not in (NO_SCHEME, so_file.SCHEME): + if scheme not in (transport.NO_SCHEME, so_file.SCHEME): return None local_path = so_file.extract_local_path(uri) @@ -500,24 +451,8 @@ def _open_binary_stream(uri, mode, transport_params): raise TypeError("don't know how to handle uri %r" % uri) scheme = _sniff_scheme(uri) - - bad_scheme = NotImplementedError( - "scheme %r is not supported, 
expected one of %r" % ( - scheme, SUPPORTED_SCHEMES, - ) - ) - - try: - transport = _TRANSPORT[scheme] - except KeyError: - raise bad_scheme - - try: - open_uri = getattr(transport, 'open_uri') - except AttributeError: - raise bad_scheme - - fobj = open_uri(uri, mode, transport_params) + submodule = transport.get_transport(scheme) + fobj = submodule.open_uri(uri, mode, transport_params) if not hasattr(fobj, 'name'): logger.critical('TODO') fobj.name = 'unknown' @@ -586,4 +521,4 @@ def _patch_pathlib(func): return old_impl -doctools.tweak_docstrings(open, parse_uri, _TRANSPORT) +doctools.tweak_docstrings(open, parse_uri) diff --git a/smart_open/transport.py b/smart_open/transport.py new file mode 100644 index 00000000..60913985 --- /dev/null +++ b/smart_open/transport.py @@ -0,0 +1,86 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2020 Radim Rehurek +# +# This code is distributed under the terms and conditions +# from the MIT License (MIT). +# +"""Maintains a registry of transport mechanisms. + +The main entrypoint is :func:`get_transport`. See also :file:`extending.md`. + +""" +import importlib +import logging + +import smart_open.local_file + +logger = logging.getLogger(__name__) + +NO_SCHEME = '' + +_REGISTRY = {NO_SCHEME: smart_open.local_file} + + +def register_transport(submodule): + """Register a submodule as a transport mechanism for ``smart_open``. + + This module **must** have: + + - `SCHEME` attribute (or `SCHEMES`, if the submodule supports multiple schemes) + - `open` function + - `open_uri` function + - `parse_uri' function + + Once registered, you can get the submodule by calling :func:`get_transport`. 
+ + """ + global _REGISTRY + if isinstance(submodule, str): + try: + submodule = importlib.import_module(submodule) + except ImportError: + logger.warning('unable to import %r, disabling that module', submodule) + return + + if hasattr(submodule, 'SCHEME'): + schemes = [submodule.SCHEME] + elif hasattr(submodule, 'SCHEMES'): + schemes = submodule.SCHEMES + else: + raise ValueError('%r does not have a .SCHEME or .SCHEMES attribute' % submodule) + + for f in ('open', 'open_uri', 'parse_uri'): + assert hasattr(submodule, f), '%r is missing %r' % (submodule, f) + + for scheme in schemes: + assert scheme not in _REGISTRY + _REGISTRY[scheme] = submodule + + +def get_transport(scheme): + """Get the submodule that handles transport for the specified scheme. + + This submodule must have been previously registered via :func:`register_transport`. + + """ + message = "scheme %r is not supported, expected one of %r" % (scheme, SUPPORTED_SCHEMES) + + try: + submodule = _REGISTRY[scheme] + except KeyError: + raise NotImplementedError(message) + else: + return submodule + + +register_transport(smart_open.local_file) +register_transport('smart_open.gcs') +register_transport('smart_open.hdfs') +register_transport('smart_open.http') +register_transport('smart_open.s3') +register_transport('smart_open.ssh') +register_transport('smart_open.webhdfs') + +SUPPORTED_SCHEMES = tuple(sorted(_REGISTRY.keys())) +"""The transport schemes that the local installation of ``smart_open`` supports.""" From 12605ab8759c232a8bde9d6f84b0b6313d60b2f0 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 27 Mar 2020 15:55:36 +0900 Subject: [PATCH 19/32] fixup --- extending.md | 2 +- smart_open/s3.py | 1 - smart_open/transport.py | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/extending.md b/extending.md index 94727117..5a1a296b 100644 --- a/extending.md +++ b/extending.md @@ -75,7 +75,7 @@ Have a look at the existing mechanisms to see how they work. 
You may define other functions and classes as necessary for your implementation. Once your module is working, register it in the `smart_open/transport.py` file. -The `generate_transport()` generator updates a dictionary that maps schemes to the modules that implement functionality for them. +The `register_transport()` function updates a mapping from schemes to the modules that implement functionality for them. Once you've registered your new transport module, the following will happen automagically: diff --git a/smart_open/s3.py b/smart_open/s3.py index db211ff8..52887192 100644 --- a/smart_open/s3.py +++ b/smart_open/s3.py @@ -21,7 +21,6 @@ import six from six.moves.urllib import parse as urlparse -from botocore.exceptions import IncompleteReadError import smart_open.bytebuffer import smart_open.utils diff --git a/smart_open/transport.py b/smart_open/transport.py index 60913985..2d43d862 100644 --- a/smart_open/transport.py +++ b/smart_open/transport.py @@ -65,7 +65,7 @@ def get_transport(scheme): """ message = "scheme %r is not supported, expected one of %r" % (scheme, SUPPORTED_SCHEMES) - + try: submodule = _REGISTRY[scheme] except KeyError: From 64b2fdd8c523326357a82a2a9ad27d445be683ce Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 27 Mar 2020 17:00:40 +0900 Subject: [PATCH 20/32] get rid of Py2 --- README.rst | 2 +- setup.py | 1 - smart_open/hdfs.py | 5 +-- smart_open/http.py | 6 +-- smart_open/s3.py | 10 ++--- smart_open/smart_open_lib.py | 42 ++++--------------- smart_open/ssh.py | 7 ++-- smart_open/tests/test_gcs.py | 7 +--- smart_open/tests/test_s3.py | 3 -- smart_open/tests/test_smart_open.py | 54 +++++-------------------- smart_open/tests/test_smart_open_old.py | 22 +++------- tox.ini | 2 +- 12 files changed, 38 insertions(+), 123 deletions(-) diff --git a/README.rst b/README.rst index 5ba741b7..51bc27f7 100644 --- a/README.rst +++ b/README.rst @@ -14,7 +14,7 @@ smart_open — utils for streaming large files in Python What? 
===== -``smart_open`` is a Python 2 & Python 3 library for **efficient streaming of very large files** from/to storages such as S3, GCS, HDFS, WebHDFS, HTTP, HTTPS, SFTP, or local filesystem. It supports transparent, on-the-fly (de-)compression for a variety of different formats. +``smart_open`` is a Python 3 library for **efficient streaming of very large files** from/to storages such as S3, GCS, HDFS, WebHDFS, HTTP, HTTPS, SFTP, or local filesystem. It supports transparent, on-the-fly (de-)compression for a variety of different formats. ``smart_open`` is a drop-in replacement for Python's built-in ``open()``: it can do anything ``open`` can (100% compatible, falls back to native ``open`` wherever possible), plus lots of nifty extra stuff on top. diff --git a/setup.py b/setup.py index 01e065f5..13076e3e 100644 --- a/setup.py +++ b/setup.py @@ -100,7 +100,6 @@ def read(fname): 'Intended Audience :: Developers', 'License :: OSI Approved :: MIT License', 'Operating System :: OS Independent', - 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', diff --git a/smart_open/hdfs.py b/smart_open/hdfs.py index d792455b..a4d892cd 100644 --- a/smart_open/hdfs.py +++ b/smart_open/hdfs.py @@ -17,8 +17,7 @@ import io import logging import subprocess - -from six.moves.urllib import parse as urlparse +import urllib.parse from smart_open import utils @@ -33,7 +32,7 @@ def parse_uri(uri_as_string): - split_uri = urlparse.urlsplit(uri_as_string) + split_uri = urllib.parse.urlsplit(uri_as_string) assert split_uri.scheme == SCHEME uri_path = split_uri.netloc + split_uri.path diff --git a/smart_open/http.py b/smart_open/http.py index 2625cb1f..952cc360 100644 --- a/smart_open/http.py +++ b/smart_open/http.py @@ -10,8 +10,8 @@ import io import logging import os.path +import urllib.parse -from six.moves.urllib import parse as urlparse import requests from smart_open import 
bytebuffer, s3 @@ -33,7 +33,7 @@ def parse_uri(uri_as_string): - split_uri = urlparse.urlsplit(uri_as_string) + split_uri = urllib.parse.urlsplit(uri_as_string) assert split_uri.scheme in SCHEMES uri_path = split_uri.netloc + split_uri.path @@ -79,7 +79,7 @@ def open(uri, mode, kerberos=False, user=None, password=None, headers=None): uri, mode, kerberos=kerberos, user=user, password=password, headers=headers ) - fobj.name = os.path.basename(urlparse.urlparse(uri).path) + fobj.name = os.path.basename(urllib.parse.urlparse(uri).path) return fobj else: raise NotImplementedError('http support for mode %r not implemented' % mode) diff --git a/smart_open/s3.py b/smart_open/s3.py index 52887192..74444d3a 100644 --- a/smart_open/s3.py +++ b/smart_open/s3.py @@ -12,15 +12,13 @@ import functools import logging import time +import urllib.parse import warnings import boto import boto3 import botocore.client import botocore.exceptions -import six - -from six.moves.urllib import parse as urlparse import smart_open.bytebuffer import smart_open.utils @@ -98,8 +96,8 @@ def _safe_urlsplit(url): https://github.com/python/cpython/blob/3.7/Lib/urllib/parse.py https://github.com/RaRe-Technologies/smart_open/issues/285 """ - sr = urlparse.urlsplit(url.replace('?', '\n'), allow_fragments=False) - return urlparse.SplitResult(sr.scheme, sr.netloc, sr.path.replace('\n', '?'), '', '') + sr = urllib.parse.urlsplit(url.replace('?', '\n'), allow_fragments=False) + return urllib.parse.SplitResult(sr.scheme, sr.netloc, sr.path.replace('\n', '?'), '', '') def parse_uri(uri_as_string): @@ -1154,7 +1152,7 @@ def _download_fileobj(bucket, key_name): class DummyPool(object): """A class that mimics multiprocessing.pool.Pool for our purposes.""" def imap_unordered(self, function, items): - return six.moves.map(function, items) + return map(function, items) def terminate(self): pass diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index 0654b745..f439d51d 100644 --- 
a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -22,13 +22,12 @@ import importlib import os import os.path as P +import pathlib +import urllib.parse import warnings import sys import boto3 -import six - -from six.moves.urllib import parse as urlparse # # This module defines a function called smart_open so we cannot use @@ -48,16 +47,6 @@ from smart_open.utils import check_kwargs as _check_kwargs # noqa: F401 from smart_open.utils import inspect_kwargs as _inspect_kwargs # noqa: F401 -# Import ``pathlib`` if the builtin ``pathlib`` or the backport ``pathlib2`` are -# available. The builtin ``pathlib`` will be imported with higher precedence. -for pathlib_module in ('pathlib', 'pathlib2'): - try: - pathlib = importlib.import_module(pathlib_module) - PATHLIB_SUPPORT = True - break - except ImportError: - PATHLIB_SUPPORT = False - logger = logging.getLogger(__name__) SYSTEM_ENCODING = sys.getdefaultencoding() @@ -78,7 +67,7 @@ def _sniff_scheme(uri_as_string): if os.name == 'nt' and '://' not in uri_as_string: uri_as_string = 'file://' + uri_as_string - return urlparse.urlsplit(uri_as_string).scheme + return urllib.parse.urlsplit(uri_as_string).scheme def parse_uri(uri_as_string): @@ -219,7 +208,7 @@ def open( """ logger.debug('%r', locals()) - if not isinstance(mode, six.string_types): + if not isinstance(mode, str): raise TypeError('mode should be a string') if transport_params is None: @@ -247,8 +236,7 @@ def open( if encoding is not None and 'b' in mode: mode = mode.replace('b', '') - # Support opening ``pathlib.Path`` objects by casting them to strings. 
- if PATHLIB_SUPPORT and isinstance(uri, pathlib.Path): + if isinstance(uri, pathlib.Path): uri = str(uri) explicit_encoding = encoding @@ -381,7 +369,7 @@ def _shortcut_open( :returns: The opened file :rtype: file """ - if not isinstance(uri, six.string_types): + if not isinstance(uri, str): return None scheme = _sniff_scheme(uri) @@ -405,17 +393,7 @@ def _shortcut_open( if errors and 'b' not in mode: open_kwargs['errors'] = errors - # - # Under Py3, the built-in open accepts kwargs, and it's OK to use that. - # Under Py2, the built-in open _doesn't_ accept kwargs, but we still use it - # whenever possible (see issue #207). If we're under Py2 and have to use - # kwargs, then we have no option other to use io.open. - # - if six.PY3: - return _builtin_open(local_path, mode, buffering=buffering, **open_kwargs) - elif not open_kwargs: - return _builtin_open(local_path, mode, buffering=buffering) - return io.open(local_path, mode, buffering=buffering, **open_kwargs) + return _builtin_open(local_path, mode, buffering=buffering, **open_kwargs) def _open_binary_stream(uri, mode, transport_params): @@ -447,7 +425,7 @@ def _open_binary_stream(uri, mode, transport_params): uri.name = getattr(uri, 'name', 'unknown') return uri - if not isinstance(uri, six.string_types): + if not isinstance(uri, str): raise TypeError("don't know how to handle uri %r" % uri) scheme = _sniff_scheme(uri) @@ -512,10 +490,6 @@ def __exit__(self, exc_type, exc_val, exc_tb): def _patch_pathlib(func): """Replace `Path.open` with `func`""" - if not PATHLIB_SUPPORT: - raise RuntimeError('install pathlib (or pathlib2) before using this function') - if six.PY2: - raise RuntimeError('this monkey patch does not work on Py2') old_impl = pathlib.Path.open pathlib.Path.open = func return old_impl diff --git a/smart_open/ssh.py b/smart_open/ssh.py index cdb50d9d..4e3a01e1 100644 --- a/smart_open/ssh.py +++ b/smart_open/ssh.py @@ -24,10 +24,9 @@ import getpass import logging +import urllib.parse import warnings 
-from six.moves.urllib import parse as urlparse - import smart_open.utils logger = logging.getLogger(__name__) @@ -49,11 +48,11 @@ def _unquote(text): - return text and urlparse.unquote(text) + return text and urllib.parse.unquote(text) def parse_uri(uri_as_string): - split_uri = urlparse.urlsplit(uri_as_string) + split_uri = urllib.parse.urlsplit(uri_as_string) assert split_uri.scheme in SCHEMES return dict( scheme=split_uri.scheme, diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index db6fa021..22123cc7 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -22,7 +22,6 @@ import google.cloud import google.api_core.exceptions -import six import smart_open @@ -42,8 +41,6 @@ def ignore_resource_warnings(): - if six.PY2: - return warnings.filterwarnings("ignore", category=ResourceWarning, message="unclosed.*") # noqa @@ -174,8 +171,8 @@ def exists(self, client=None): def upload_from_string(self, data): # mimics Google's API by accepting bytes or str, despite the method name # https://google-cloud-python.readthedocs.io/en/0.32.0/storage/blobs.html#google.cloud.storage.blob.Blob.upload_from_string - if isinstance(data, six.string_types): - data = bytes(data) if six.PY2 else bytes(data, 'utf8') + if isinstance(data, str): + data = bytes(data, 'utf8') self.__contents = io.BytesIO(data) self.__contents.seek(0, io.SEEK_END) diff --git a/smart_open/tests/test_s3.py b/smart_open/tests/test_s3.py index c5a7f7d0..18701321 100644 --- a/smart_open/tests/test_s3.py +++ b/smart_open/tests/test_s3.py @@ -18,7 +18,6 @@ import botocore.client import mock import moto -import six import smart_open import smart_open.s3 @@ -77,8 +76,6 @@ def ignore_resource_warnings(): # https://github.com/boto/boto3/issues/454 # Py2 doesn't have ResourceWarning, so do nothing. 
# - if six.PY2: - return warnings.filterwarnings("ignore", category=ResourceWarning, message="unclosed.*") # noqa diff --git a/smart_open/tests/test_smart_open.py b/smart_open/tests/test_smart_open.py index c55be3e1..32e83bf8 100644 --- a/smart_open/tests/test_smart_open.py +++ b/smart_open/tests/test_smart_open.py @@ -19,7 +19,6 @@ from moto import mock_s3 import responses import gzip -import six import smart_open from smart_open import smart_open_lib @@ -288,7 +287,6 @@ def test_gs_uri_contains_slash(self): self.assertEqual(parsed_uri.bucket_id, "mybucket") self.assertEqual(parsed_uri.blob_id, "mydir/myblob") - @unittest.skipUnless(smart_open_lib.six.PY3, "our monkey patch only works on Py3") def test_pathlib_monkeypatch(self): from smart_open.smart_open_lib import pathlib @@ -305,7 +303,6 @@ def test_pathlib_monkeypatch(self): _patch_pathlib(obj.old_impl) assert pathlib.Path.open != smart_open.open - @unittest.skipUnless(smart_open_lib.six.PY3, "our monkey patch only works on Py3") def test_pathlib_monkeypath_read_gz(self): from smart_open.smart_open_lib import pathlib @@ -325,11 +322,6 @@ def test_pathlib_monkeypath_read_gz(self): finally: _patch_pathlib(obj.old_impl) - @unittest.skipUnless(smart_open_lib.six.PY2, 'this test is for Py2 only') - def test_monkey_patch_raises_exception_py2(self): - with self.assertRaises(RuntimeError): - patch_pathlib() - class SmartOpenHttpTest(unittest.TestCase): """ @@ -399,7 +391,6 @@ def _test_compressed_http(self, suffix, query): # decompress the file and get the same md5 hash self.assertEqual(smart_open_object.read(), raw_data) - @unittest.skipIf(six.PY2, 'gzip support for Py2 is not implemented yet') def test_http_gz(self): """Can open gzip via http?""" self._test_compressed_http(".gz", False) @@ -408,7 +399,6 @@ def test_http_bz2(self): """Can open bzip2 via http?""" self._test_compressed_http(".bz2", False) - @unittest.skipIf(six.PY2, 'gzip support for Py2 is not implemented yet') def test_http_gz_query(self): """Can 
open gzip via http with a query appended to URI?""" self._test_compressed_http(".gz", True) @@ -418,7 +408,7 @@ def test_http_bz2_query(self): self._test_compressed_http(".bz2", True) -def make_buffer(cls=six.BytesIO, initial_value=None, name=None, noclose=False): +def make_buffer(cls=io.BytesIO, initial_value=None, name=None, noclose=False): """ Construct a new in-memory file object aka "buf". @@ -431,9 +421,6 @@ def make_buffer(cls=six.BytesIO, initial_value=None, name=None, noclose=False): buf = cls(initial_value) if initial_value else cls() if name is not None: buf.name = name - if six.PY2: - buf.__enter__ = lambda: buf - buf.__exit__ = lambda exc_type, exc_val, exc_tb: None if noclose: buf.close = lambda: None return buf @@ -515,7 +502,6 @@ def test_write_bytes(self): sf.write(SAMPLE_BYTES) self.assertEqual(buf.getvalue(), SAMPLE_BYTES) - @unittest.skipIf(six.PY2, "Python 2 does not differentiate between str and bytes") def test_read_text_stream_fails(self): """Attempts to read directly from a text stream should fail. @@ -523,14 +509,13 @@ def test_read_text_stream_fails(self): If you have a text stream, there's no point passing it to smart_open: you can read from it directly. 
""" - buf = make_buffer(six.StringIO, initial_value=SAMPLE_TEXT) + buf = make_buffer(io.StringIO, initial_value=SAMPLE_TEXT) with smart_open.smart_open(buf, 'r') as sf: self.assertRaises(TypeError, sf.read) # we expect binary mode - @unittest.skipIf(six.PY2, "Python 2 does not differentiate between str and bytes") def test_write_text_stream_fails(self): """Attempts to write directly to a text stream should fail.""" - buf = make_buffer(six.StringIO) + buf = make_buffer(io.StringIO) with smart_open.smart_open(buf, 'w') as sf: self.assertRaises(TypeError, sf.write, SAMPLE_TEXT) # we expect binary mode @@ -649,7 +634,6 @@ def test_open_with_keywords_explicit_r(self): actual = fin.read() self.assertEqual(expected, actual) - @unittest.skipUnless(smart_open_lib.PATHLIB_SUPPORT, "this test requires pathlib") def test_open_and_read_pathlib_path(self): """If ``pathlib.Path`` is available we should be able to open and read.""" from smart_open.smart_open_lib import pathlib @@ -762,7 +746,7 @@ def test_file(self, mock_smart_open): short_path = "~/tmp/test.txt" full_path = os.path.expanduser(short_path) - @mock.patch(_IO_OPEN if six.PY2 else _BUILTIN_OPEN) + @mock.patch(_BUILTIN_OPEN) def test_file_errors(self, mock_smart_open): prefix = "file://" full_path = '/tmp/test.txt' @@ -1003,8 +987,7 @@ def test_file_mode_mock(self): # def test_text(self): - patch = _IO_OPEN if six.PY2 else _BUILTIN_OPEN - with mock.patch(patch, mock.Mock(return_value=self.stringio)) as mock_open: + with mock.patch(_BUILTIN_OPEN, mock.Mock(return_value=self.stringio)) as mock_open: with smart_open.smart_open("blah", "r", encoding='utf-8') as fin: self.assertEqual(fin.read(), self.as_text) mock_open.assert_called_with("blah", "r", buffering=-1, encoding='utf-8') @@ -1033,22 +1016,19 @@ def test_incorrect(self): def test_write_utf8(self): # correct write mode, correct file:// URI - patch = _IO_OPEN if six.PY2 else _BUILTIN_OPEN - with mock.patch(patch, mock.Mock(return_value=self.stringio)) as mock_open: 
+ with mock.patch(_BUILTIN_OPEN, mock.Mock(return_value=self.stringio)) as mock_open: with smart_open.smart_open("blah", "w", encoding='utf-8') as fout: mock_open.assert_called_with("blah", "w", buffering=-1, encoding='utf-8') fout.write(self.as_text) def test_write_utf8_absolute_path(self): - patch = _IO_OPEN if six.PY2 else _BUILTIN_OPEN - with mock.patch(patch, mock.Mock(return_value=self.stringio)) as mock_open: + with mock.patch(_BUILTIN_OPEN, mock.Mock(return_value=self.stringio)) as mock_open: with smart_open.smart_open("/some/file.txt", "w", encoding='utf-8') as fout: mock_open.assert_called_with("/some/file.txt", "w", buffering=-1, encoding='utf-8') fout.write(self.as_text) def test_append_utf8(self): - patch = _IO_OPEN if six.PY2 else _BUILTIN_OPEN - with mock.patch(patch, mock.Mock(return_value=self.stringio)) as mock_open: + with mock.patch(_BUILTIN_OPEN, mock.Mock(return_value=self.stringio)) as mock_open: with smart_open.smart_open("/some/file.txt", "w+", encoding='utf-8') as fout: mock_open.assert_called_with("/some/file.txt", "w+", buffering=-1, encoding='utf-8') fout.write(self.as_text) @@ -1326,29 +1306,13 @@ def cleanup_temp_bz2(self, test_file): os.unlink(test_file) def test_can_read_multistream_bz2(self): - if six.PY2: - # this is a backport from Python 3 - from bz2file import BZ2File - else: - from bz2 import BZ2File + from bz2 import BZ2File test_file = self.create_temp_bz2(streams=5) with BZ2File(test_file) as bz2f: self.assertEqual(bz2f.read(), self.TEXT * 5) self.cleanup_temp_bz2(test_file) - def test_python2_stdlib_bz2_cannot_read_multistream(self): - # Multistream bzip is included in Python 3 - if not six.PY2: - return - import bz2 - - test_file = self.create_temp_bz2(streams=5) - bz2f = bz2.BZ2File(test_file) - self.assertNotEqual(bz2f.read(), self.TEXT * 5) - bz2f.close() - self.cleanup_temp_bz2(test_file) - def test_file_smart_open_can_read_multistream_bz2(self): test_file = self.create_temp_bz2(streams=5) with 
smart_open_lib.smart_open(test_file) as bz2f: diff --git a/smart_open/tests/test_smart_open_old.py b/smart_open/tests/test_smart_open_old.py index 31fe50f8..7fa8c30e 100644 --- a/smart_open/tests/test_smart_open_old.py +++ b/smart_open/tests/test_smart_open_old.py @@ -25,7 +25,6 @@ from moto import mock_s3 import responses import gzip -import six import smart_open from smart_open import smart_open_lib @@ -70,7 +69,6 @@ def test_http_pass(self): self.assertTrue(actual_request.headers['Authorization'].startswith('Basic ')) @responses.activate - @unittest.skipIf(six.PY2, 'gzip support for Py2 is not implemented yet') def test_http_gz(self): """Can open gzip via http?""" fpath = os.path.join(CURR_DIR, 'test_data/crlf_at_1k_boundary.warc.gz') @@ -88,7 +86,6 @@ def test_http_gz(self): self.assertEqual(m.hexdigest(), expected_hash) @responses.activate - @unittest.skipIf(six.PY2, 'gzip support for Py2 is not implemented yet') def test_http_gz_noquerystring(self): """Can open gzip via http?""" fpath = os.path.join(CURR_DIR, 'test_data/crlf_at_1k_boundary.warc.gz') @@ -171,9 +168,6 @@ def test_open_with_keywords_explicit_r(self): actual = fin.read() self.assertEqual(expected, actual) - @unittest.skipUnless( - smart_open_lib.PATHLIB_SUPPORT, - "do not test pathlib support if pathlib or backport are not available") def test_open_and_read_pathlib_path(self): """If ``pathlib.Path`` is available we should be able to open and read.""" from smart_open.smart_open_lib import pathlib @@ -286,7 +280,7 @@ def test_file(self, mock_smart_open): short_path = "~/tmp/test.txt" full_path = os.path.expanduser(short_path) - @mock.patch(_IO_OPEN if six.PY2 else _BUILTIN_OPEN) + @mock.patch(_BUILTIN_OPEN) def test_file_errors(self, mock_smart_open): prefix = "file://" full_path = '/tmp/test.txt' @@ -546,8 +540,7 @@ def test_file_mode_mock(self): # def test_text(self): - patch = _IO_OPEN if six.PY2 else _BUILTIN_OPEN - with mock.patch(patch, mock.Mock(return_value=self.stringio)) as mock_open: + 
with mock.patch(_BUILTIN_OPEN, mock.Mock(return_value=self.stringio)) as mock_open: with smart_open.smart_open("blah", "r", encoding='utf-8') as fin: self.assertEqual(fin.read(), self.as_text) mock_open.assert_called_with("blah", "r", buffering=-1, encoding='utf-8') @@ -576,22 +569,19 @@ def test_incorrect(self): def test_write_utf8(self): # correct write mode, correct file:// URI - patch = _IO_OPEN if six.PY2 else _BUILTIN_OPEN - with mock.patch(patch, mock.Mock(return_value=self.stringio)) as mock_open: + with mock.patch(_BUILTIN_OPEN, mock.Mock(return_value=self.stringio)) as mock_open: with smart_open.smart_open("blah", "w", encoding='utf-8') as fout: mock_open.assert_called_with("blah", "w", buffering=-1, encoding='utf-8') fout.write(self.as_text) def test_write_utf8_absolute_path(self): - patch = _IO_OPEN if six.PY2 else _BUILTIN_OPEN - with mock.patch(patch, mock.Mock(return_value=self.stringio)) as mock_open: + with mock.patch(_BUILTIN_OPEN, mock.Mock(return_value=self.stringio)) as mock_open: with smart_open.smart_open("/some/file.txt", "w", encoding='utf-8') as fout: mock_open.assert_called_with("/some/file.txt", "w", buffering=-1, encoding='utf-8') fout.write(self.as_text) def test_append_utf8(self): - patch = _IO_OPEN if six.PY2 else _BUILTIN_OPEN - with mock.patch(patch, mock.Mock(return_value=self.stringio)) as mock_open: + with mock.patch(_BUILTIN_OPEN, mock.Mock(return_value=self.stringio)) as mock_open: with smart_open.smart_open("/some/file.txt", "w+", encoding='utf-8') as fout: mock_open.assert_called_with("/some/file.txt", "w+", buffering=-1, encoding='utf-8') fout.write(self.as_text) @@ -973,7 +963,6 @@ def test_rw_gzip(self): with smart_open.smart_open(key, "rb") as fin: self.assertEqual(fin.read().decode("utf-8"), text) - @unittest.skipIf(six.PY2, 'this test does not work with Py2') @mock_s3 def test_gzip_write_mode(self): """Should always open in binary mode when writing through a codec.""" @@ -984,7 +973,6 @@ def test_gzip_write_mode(self): 
smart_open.smart_open("s3://bucket/key.gz", "wb") mock_open.assert_called_with('bucket', 'key.gz', 'wb') - @unittest.skipIf(six.PY2, 'this test does not work with Py2') @mock_s3 def test_gzip_read_mode(self): """Should always open in binary mode when reading through a codec.""" diff --git a/tox.ini b/tox.ini index 2b51b1b7..a9cf0a23 100644 --- a/tox.ini +++ b/tox.ini @@ -1,6 +1,6 @@ [tox] minversion = 2.0 -envlist = py{27,35,36,37}-{test,doctest,integration,benchmark}, sdist, flake8 +envlist = py{35,36,37}-{test,doctest,integration,benchmark}, sdist, flake8 [pytest] addopts = -rfxEXs --durations=20 --showlocals --reruns 3 --reruns-delay 1 From 98ded3539eff7b8dc6525bebcf181e386f11ff2e Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 27 Mar 2020 17:08:28 +0900 Subject: [PATCH 21/32] get rid of Py2, for real this time --- .travis.yml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.travis.yml b/.travis.yml index a53759ba..ad0597a6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,13 +8,6 @@ jobs: name: "flake8" env: TOXENV="flake8" - - python: '2.7' - env: - - SO_S3_URL: "s3://smart-open-py27-benchmark" - - SO_S3_RESULT_URL: "s3://smart-open-py27-benchmark-results" - - BOTO_CONFIG: "/dev/null" - - TOXENV: "check_keys,py27-test,py27-benchmark,py27-integration" - - python: '3.5' env: TOXENV="check_keys,py35-test,py35-integration" From 903bfd0ed996add33b615ca0a9a46d31e94a4366 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 27 Mar 2020 17:12:07 +0900 Subject: [PATCH 22/32] get rid of unused imports --- smart_open/smart_open_lib.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index f439d51d..7b2a7bd6 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -18,8 +18,6 @@ import codecs import collections import logging -import io -import importlib import os import os.path as P import pathlib From f5dc67fba694444e20e6bf16ff1bc63463ed185b Mon Sep 17 00:00:00 2001 
From: Michael Penkov Date: Fri, 27 Mar 2020 17:29:53 +0900 Subject: [PATCH 23/32] still more Py2 removal --- setup.py | 5 +---- smart_open/compression.py | 25 +------------------------ smart_open/doctools.py | 9 --------- smart_open/gcs.py | 10 +++------- smart_open/tests/test_bytebuffer.py | 18 ++++++++++-------- smart_open/tests/test_hdfs.py | 3 --- smart_open/webhdfs.py | 16 ++++++---------- 7 files changed, 21 insertions(+), 65 deletions(-) diff --git a/setup.py b/setup.py index 13076e3e..cb3863ee 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,6 @@ import io import os -import sys from setuptools import setup, find_packages @@ -18,7 +17,7 @@ def _get_version(): curr_dir = os.path.dirname(os.path.abspath(__file__)) with open(os.path.join(curr_dir, 'smart_open', 'version.py')) as fin: # - # __version__ = '1.8.4' + # example: __version__ = '1.8.4' # line = fin.readline().strip() parts = line.split(' ') @@ -59,8 +58,6 @@ def read(fname): 'boto3', 'google-cloud-storage', ] -if sys.version_info[0] == 2: - install_requires.append('bz2file') setup( name='smart_open', diff --git a/smart_open/compression.py b/smart_open/compression.py index e57b0269..36fd8fb4 100644 --- a/smart_open/compression.py +++ b/smart_open/compression.py @@ -11,13 +11,10 @@ import os.path import warnings -import six - logger = logging.getLogger(__name__) _COMPRESSOR_REGISTRY = {} -_ISSUE_189_URL = 'https://github.com/RaRe-Technologies/smart_open/issues/189' def get_supported_extensions(): @@ -58,10 +55,7 @@ def register_compressor(ext, callback): def _handle_bz2(file_obj, mode): - if six.PY2: - from bz2file import BZ2File - else: - from bz2 import BZ2File + from bz2 import BZ2File return BZ2File(file_obj, mode) @@ -91,9 +85,6 @@ def compression_wrapper(file_obj, mode): ) return file_obj - if _need_to_buffer(file_obj, mode, ext): - warnings.warn('streaming gzip support unavailable, see %s' % _ISSUE_189_URL) - file_obj = io.BytesIO(file_obj.read()) if ext in _COMPRESSOR_REGISTRY and 
mode.endswith('+'): raise ValueError('transparent (de)compression unsupported for mode %r' % mode) @@ -105,20 +96,6 @@ def compression_wrapper(file_obj, mode): return callback(file_obj, mode) -def _need_to_buffer(file_obj, mode, ext): - """Returns True if we need to buffer the whole file in memory in order to proceed.""" - try: - is_seekable = file_obj.seekable() - except AttributeError: - # - # Under Py2, built-in file objects returned by open do not have - # .seekable, but have a .seek method instead. - # - is_seekable = hasattr(file_obj, 'seek') - is_compressed = ext in _COMPRESSOR_REGISTRY - return six.PY2 and mode.startswith('r') and is_compressed and not is_seekable - - # # NB. avoid using lambda here to make stack traces more readable. # diff --git a/smart_open/doctools.py b/smart_open/doctools.py index fdcbed6c..15801e83 100644 --- a/smart_open/doctools.py +++ b/smart_open/doctools.py @@ -16,8 +16,6 @@ import os.path import re -import six - from . import compression from . import transport @@ -168,13 +166,6 @@ def extract_examples_from_readme_rst(indent=' '): def tweak_docstrings(open_function, parse_uri_function): - # - # The code below doesn't work on Py2. We _could_ make it work, but given - # that it's 2020 and Py2 is on it's way out, I'm just going to disable it. 
- # - if six.PY2: - return - substrings = {} schemes = io.StringIO() seen_examples = set() diff --git a/smart_open/gcs.py b/smart_open/gcs.py index eec28493..b357e656 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -9,13 +9,12 @@ import io import logging +import urllib.parse import sys import google.cloud.exceptions import google.cloud.storage import google.auth.transport.requests as google_requests -import six -from six.moves.urllib import parse as urlparse import smart_open.bytebuffer import smart_open.s3 @@ -28,12 +27,9 @@ _MODES = (_READ_BINARY, _WRITE_BINARY) """Allowed I/O modes for working with GCS.""" -_BINARY_TYPES = (six.binary_type, bytearray) +_BINARY_TYPES = (bytes, bytearray, memoryview) """Allowed binary buffer types for writing to the underlying GCS stream""" -if sys.version_info >= (2, 7): - _BINARY_TYPES = (six.binary_type, bytearray, memoryview) - _BINARY_NEWLINE = b'\n' _UNKNOWN_FILE_SIZE = '*' @@ -106,7 +102,7 @@ def from_response(cls, response, part_num, content_length, total_size, headers): def parse_uri(uri_as_string): - sr = urlparse.urlsplit(uri_as_string) + sr = urllib.parse.urlsplit(uri_as_string) assert sr.scheme == SCHEME bucket_id = sr.netloc blob_id = sr.path.lstrip('/') diff --git a/smart_open/tests/test_bytebuffer.py b/smart_open/tests/test_bytebuffer.py index 7b0344d5..42aa1687 100644 --- a/smart_open/tests/test_bytebuffer.py +++ b/smart_open/tests/test_bytebuffer.py @@ -9,23 +9,25 @@ import random import unittest -import six - import smart_open.bytebuffer CHUNK_SIZE = 1024 +def int2byte(i): + return bytes((i, )) + + def random_byte_string(length=CHUNK_SIZE): - rand_bytes = [six.int2byte(random.randint(0, 255)) for _ in range(length)] + rand_bytes = [int2byte(random.randint(0, 255)) for _ in range(length)] return b''.join(rand_bytes) def bytebuffer_and_random_contents(): buf = smart_open.bytebuffer.ByteBuffer(CHUNK_SIZE) contents = random_byte_string(CHUNK_SIZE) - content_reader = six.BytesIO(contents) + content_reader = 
io.BytesIO(contents) buf.fill(content_reader) return [buf, contents] @@ -47,7 +49,7 @@ def test_len(self): def test_fill_from_reader(self): buf = smart_open.bytebuffer.ByteBuffer(CHUNK_SIZE) contents = random_byte_string(CHUNK_SIZE) - content_reader = six.BytesIO(contents) + content_reader = io.BytesIO(contents) bytes_filled = buf.fill(content_reader) self.assertEqual(bytes_filled, CHUNK_SIZE) @@ -77,7 +79,7 @@ def test_fill_from_list(self): def test_fill_multiple(self): buf = smart_open.bytebuffer.ByteBuffer(CHUNK_SIZE) long_contents = random_byte_string(CHUNK_SIZE * 4) - long_content_reader = six.BytesIO(long_contents) + long_content_reader = io.BytesIO(long_contents) first_bytes_filled = buf.fill(long_content_reader) self.assertEqual(first_bytes_filled, CHUNK_SIZE) @@ -89,7 +91,7 @@ def test_fill_multiple(self): def test_fill_size(self): buf = smart_open.bytebuffer.ByteBuffer(CHUNK_SIZE) contents = random_byte_string(CHUNK_SIZE * 2) - content_reader = six.BytesIO(contents) + content_reader = io.BytesIO(contents) fill_size = int(CHUNK_SIZE / 2) bytes_filled = buf.fill(content_reader, size=fill_size) @@ -105,7 +107,7 @@ def test_fill_reader_exhaustion(self): buf = smart_open.bytebuffer.ByteBuffer(CHUNK_SIZE) short_content_size = int(CHUNK_SIZE / 4) short_contents = random_byte_string(short_content_size) - short_content_reader = six.BytesIO(short_contents) + short_content_reader = io.BytesIO(short_contents) bytes_filled = buf.fill(short_content_reader) self.assertEqual(bytes_filled, short_content_size) diff --git a/smart_open/tests/test_hdfs.py b/smart_open/tests/test_hdfs.py index c0fb8aab..a28ab991 100644 --- a/smart_open/tests/test_hdfs.py +++ b/smart_open/tests/test_hdfs.py @@ -14,7 +14,6 @@ import unittest import mock -import six import smart_open.hdfs @@ -56,7 +55,6 @@ def test_read_100(self): expected = 'В начале июля, в чрезвычайно жаркое время' self.assertEqual(expected, as_text) - @unittest.skipIf(six.PY2, 'gzip support for Py2 is not implemented yet') 
def test_unzip(self): path = P.join(CURR_DIR, 'test_data/crime-and-punishment.txt.gz') cat = subprocess.Popen(['cat', path], stdout=subprocess.PIPE) @@ -93,7 +91,6 @@ def test_write(self): actual = cat.stdout.read().decode('utf-8') self.assertEqual(as_text, actual) - @unittest.skipIf(six.PY2, 'gzip support for Py2 is not implemented yet') def test_zip(self): cat = subprocess.Popen(['cat'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) as_text = 'мы в ответе за тех, кого приручили' diff --git a/smart_open/webhdfs.py b/smart_open/webhdfs.py index b05797ed..cfdac5f9 100644 --- a/smart_open/webhdfs.py +++ b/smart_open/webhdfs.py @@ -14,17 +14,13 @@ import io import logging +import urllib.parse import requests -import six -from six.moves.urllib import parse as urlparse from smart_open import utils -if six.PY2: - import httplib -else: - import http.client as httplib +import http.client as httplib logger = logging.getLogger(__name__) @@ -79,17 +75,17 @@ def _convert_to_http_uri(webhdfs_url): webhdfs_url: str A URL starting with webhdfs:// """ - split_uri = urlparse.urlsplit(webhdfs_url) + split_uri = urllib.parse.urlsplit(webhdfs_url) netloc = split_uri.hostname if split_uri.port: netloc += ":{}".format(split_uri.port) query = split_uri.query if split_uri.username: query += ( - ("&" if query else "") + "user.name=" + urlparse.quote(split_uri.username) + ("&" if query else "") + "user.name=" + urllib.parse.quote(split_uri.username) ) - return urlparse.urlunsplit( + return urllib.parse.urlunsplit( ("http", netloc, "/webhdfs/v1" + split_uri.path, query, "") ) @@ -233,7 +229,7 @@ def write(self, b): if self._closed: raise ValueError("I/O operation on closed file") - if not isinstance(b, six.binary_type): + if not isinstance(b, bytes): raise TypeError("input must be a binary string") self.lines.append(b) From b309d589ffbd1567fadb718be6fab9bb00ae2d1d Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 27 Mar 2020 21:49:21 +0900 Subject: [PATCH 24/32] remove unused imports 
--- smart_open/compression.py | 2 -- smart_open/gcs.py | 1 - 2 files changed, 3 deletions(-) diff --git a/smart_open/compression.py b/smart_open/compression.py index 36fd8fb4..d459761c 100644 --- a/smart_open/compression.py +++ b/smart_open/compression.py @@ -6,10 +6,8 @@ # from the MIT License (MIT). # """Implements the compression layer of the ``smart_open`` library.""" -import io import logging import os.path -import warnings logger = logging.getLogger(__name__) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index b357e656..b942f20f 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -10,7 +10,6 @@ import io import logging import urllib.parse -import sys import google.cloud.exceptions import google.cloud.storage From a936beaed7afd0f04771f3ad4eb46cd49a77b1ac Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 27 Mar 2020 22:00:42 +0900 Subject: [PATCH 25/32] warn on missing docstrings --- smart_open/doctools.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/smart_open/doctools.py b/smart_open/doctools.py index 15801e83..bf9c3c7e 100644 --- a/smart_open/doctools.py +++ b/smart_open/doctools.py @@ -15,6 +15,7 @@ import io import os.path import re +import warnings from . import compression from . import transport @@ -166,6 +167,17 @@ def extract_examples_from_readme_rst(indent=' '): def tweak_docstrings(open_function, parse_uri_function): + # + # The docstring can be None if -OO was passed to the interpreter. 
+ # + if not (open_function.__doc__ and parse_uri_function.__doc__): + warnings.warn( + 'docstrings for smart_open function are missing, ' + 'see https://github.com/RaRe-Technologies/smart_open' + '/blob/master/README.rst if you need documentation' + ) + return + substrings = {} schemes = io.StringIO() seen_examples = set() @@ -200,14 +212,8 @@ def tweak_docstrings(open_function, parse_uri_function): ) substrings['examples'] = extract_examples_from_readme_rst() - # - # The docstring can be None if -OO was passed to the interpreter. - # - if open_function.__doc__: - open_function.__doc__ = open_function.__doc__ % substrings - - if parse_uri_function.__doc__: - parse_uri_function.__doc__ = parse_uri_function.__doc__ % dict( - schemes=schemes.getvalue(), - uri_examples=uri_examples.getvalue(), - ) + open_function.__doc__ = open_function.__doc__ % substrings + parse_uri_function.__doc__ = parse_uri_function.__doc__ % dict( + schemes=schemes.getvalue(), + uri_examples=uri_examples.getvalue(), + ) From 0720cfc91eb3cc2ca8d5506d80e5d38513c9f977 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 27 Mar 2020 22:02:16 +0900 Subject: [PATCH 26/32] docstring before and after newline --- smart_open/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/smart_open/utils.py b/smart_open/utils.py index 0eb9a922..5f01d164 100644 --- a/smart_open/utils.py +++ b/smart_open/utils.py @@ -5,7 +5,9 @@ # This code is distributed under the terms and conditions # from the MIT License (MIT). 
# + """Helper functions for documentation, etc.""" + import inspect import logging From caf5a4290b7dbcda73e4a5d8c05e96a891c8cb37 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 27 Mar 2020 22:17:28 +0900 Subject: [PATCH 27/32] add doc links to submodules --- extending.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/extending.md b/extending.md index 5a1a296b..f0c7d08d 100644 --- a/extending.md +++ b/extending.md @@ -13,13 +13,13 @@ The first is by far the more challenging, and also the more welcome. Each transport mechanism lives in its own submodule. For example, currently we have: -- `smart_open.file` -- `smart_open.s3` -- `smart_open.ssh` +- [smart_open.local_file](smart_open/local_file.py) +- [smart_open.s3](smart_open/s3.py) +- [smart_open.ssh](smart_open/ssh.py) - ... and others So, to implement a new transport mechanism, you need to create a new module. -Your module must expose the following: +Your module must expose the following (see [smart_open.http](smart_open/http.py) for the full implementation): ```python SCHEMA = ... @@ -74,7 +74,7 @@ def open(..., mode, param1=None, param2=None, paramN=None): Have a look at the existing mechanisms to see how they work. You may define other functions and classes as necessary for your implementation. -Once your module is working, register it in the `smart_open/transport.py` file. +Once your module is working, register it in the [smart_open.transport](smart_open/transport.py) submodule. The `register_transport()` function updates a mapping from schemes to the modules that implement functionality for them. 
Once you've registered your new transport module, the following will happen automagically: From df7aee7f3bcd763fb136f0c6fa428ac69ee65877 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 27 Mar 2020 22:17:40 +0900 Subject: [PATCH 28/32] remove useless comment in setup.py --- setup.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index cb3863ee..a33a24b3 100644 --- a/setup.py +++ b/setup.py @@ -16,14 +16,12 @@ def _get_version(): curr_dir = os.path.dirname(os.path.abspath(__file__)) with open(os.path.join(curr_dir, 'smart_open', 'version.py')) as fin: - # - # example: __version__ = '1.8.4' - # line = fin.readline().strip() parts = line.split(' ') + assert len(parts) == 3 assert parts[0] == '__version__' assert parts[1] == '=' - return parts[2][1:-1] + return parts[2].strip('\'"') # From c1be8deaf4abad08a447dd6be3436a11f4fecd08 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 27 Mar 2020 22:17:46 +0900 Subject: [PATCH 29/32] improve examples --- smart_open/ssh.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/smart_open/ssh.py b/smart_open/ssh.py index 4e3a01e1..75f2c8fa 100644 --- a/smart_open/ssh.py +++ b/smart_open/ssh.py @@ -42,8 +42,10 @@ DEFAULT_PORT = 22 URI_EXAMPLES = ( - '[ssh|scp|sftp]://username@host//path/file', - '[ssh|scp|sftp]://username@host/path/file', + 'ssh://username@host/path/file', + 'ssh://username@host//path/file', + 'scp://username@host/path/file', + 'sftp://username@host/path/file', ) From 0f6d5e49901a18ed38b3b56ae8f8d836a08188f1 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sat, 28 Mar 2020 15:46:12 +0900 Subject: [PATCH 30/32] split out utils and constants submodules --- smart_open/constants.py | 26 +++++++++++++++++ smart_open/gcs.py | 45 +++++++++++------------------ smart_open/http.py | 18 ++++++------ smart_open/s3.py | 52 ++++++++++------------------------ smart_open/tests/test_gcs.py | 7 +++-- smart_open/tests/test_http.py | 7 +++-- 
smart_open/tests/test_s3.py | 13 ++------- smart_open/tests/test_utils.py | 22 ++++++++++++++ smart_open/utils.py | 48 +++++++++++++++++++++++++++++++ smart_open/webhdfs.py | 6 ++-- 10 files changed, 150 insertions(+), 94 deletions(-) create mode 100644 smart_open/constants.py create mode 100644 smart_open/tests/test_utils.py diff --git a/smart_open/constants.py b/smart_open/constants.py new file mode 100644 index 00000000..1ffa14e3 --- /dev/null +++ b/smart_open/constants.py @@ -0,0 +1,26 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2020 Radim Rehurek +# +# This code is distributed under the terms and conditions +# from the MIT License (MIT). +# + +"""Some universal constants that are common to I/O operations.""" + + +READ_BINARY = 'rb' + +WRITE_BINARY = 'wb' + +BINARY_MODES = (READ_BINARY, WRITE_BINARY) + +BINARY_NEWLINE = b'\n' + +WHENCE_START = 0 + +WHENCE_CURRENT = 1 + +WHENCE_END = 2 + +WHENCE_CHOICES = (WHENCE_START, WHENCE_CURRENT, WHENCE_END) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index b942f20f..2aeb4e12 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -17,20 +17,15 @@ import smart_open.bytebuffer import smart_open.s3 +import smart_open.utils -logger = logging.getLogger(__name__) - -_READ_BINARY = 'rb' -_WRITE_BINARY = 'wb' +from smart_open import constants -_MODES = (_READ_BINARY, _WRITE_BINARY) -"""Allowed I/O modes for working with GCS.""" +logger = logging.getLogger(__name__) _BINARY_TYPES = (bytes, bytearray, memoryview) """Allowed binary buffer types for writing to the underlying GCS stream""" -_BINARY_NEWLINE = b'\n' - _UNKNOWN_FILE_SIZE = '*' SCHEME = "gs" @@ -45,22 +40,14 @@ DEFAULT_BUFFER_SIZE = 256 * 1024 """Default buffer size for working with GCS""" -START = 0 -"""Seek to the absolute start of a GCS file""" - -CURRENT = 1 -"""Seek relative to the current positive of a GCS file""" - -END = 2 -"""Seek relative to the end of a GCS file""" - -_WHENCE_CHOICES = (START, CURRENT, END) - _UPLOAD_INCOMPLETE_STATUS_CODE = 308 
_UPLOAD_COMPLETE_STATUS_CODES = (200, 201) def _make_range_string(start, stop=None, end=_UNKNOWN_FILE_SIZE): + # + # GCS seems to violate RFC-2616 (see utils.make_range_string), so we + # need a separate implementation. # # https://cloud.google.com/storage/docs/xml-api/resumable-upload#step_3upload_the_file_blocks # @@ -140,15 +127,15 @@ def open( The GCS client to use when working with google-cloud-storage. """ - if mode == _READ_BINARY: + if mode == constants.READ_BINARY: return SeekableBufferedInputBase( bucket_id, blob_id, buffer_size=buffer_size, - line_terminator=_BINARY_NEWLINE, + line_terminator=constants.BINARY_NEWLINE, client=client, ) - elif mode == _WRITE_BINARY: + elif mode == constants.WRITE_BINARY: return BufferedOutputBase( bucket_id, blob_id, @@ -214,7 +201,7 @@ def __init__( bucket, key, buffer_size=DEFAULT_BUFFER_SIZE, - line_terminator=_BINARY_NEWLINE, + line_terminator=constants.BINARY_NEWLINE, client=None, # type: google.cloud.storage.Client ): if client is None: @@ -266,7 +253,7 @@ def detach(self): """Unsupported.""" raise io.UnsupportedOperation - def seek(self, offset, whence=START): + def seek(self, offset, whence=constants.WHENCE_START): """Seek to the specified position. :param int offset: The offset in bytes. 
@@ -274,16 +261,16 @@ def seek(self, offset, whence=START): Returns the position after seeking.""" logger.debug('seeking to offset: %r whence: %r', offset, whence) - if whence not in _WHENCE_CHOICES: - raise ValueError('invalid whence, expected one of %r' % _WHENCE_CHOICES) + if whence not in constants.WHENCE_CHOICES: + raise ValueError('invalid whence, expected one of %r' % constants.WHENCE_CHOICES) - if whence == START: + if whence == constants.WHENCE_START: new_position = offset - elif whence == CURRENT: + elif whence == constants.WHENCE_CURRENT: new_position = self._current_pos + offset else: new_position = self._size + offset - new_position = smart_open.s3.clamp(new_position, 0, self._size) + new_position = smart_open.utils.clamp(new_position, 0, self._size) self._current_pos = new_position self._raw_reader.seek(new_position) logger.debug('current_pos: %r', self._current_pos) diff --git a/smart_open/http.py b/smart_open/http.py index 952cc360..975ec262 100644 --- a/smart_open/http.py +++ b/smart_open/http.py @@ -14,7 +14,7 @@ import requests -from smart_open import bytebuffer, s3 +from smart_open import bytebuffer, constants import smart_open.utils DEFAULT_BUFFER_SIZE = 128 * 1024 @@ -74,7 +74,7 @@ def open(uri, mode, kerberos=False, user=None, password=None, headers=None): unauthenticated, unless set separately in headers. 
""" - if mode == 'rb': + if mode == constants.READ_BINARY: fobj = SeekableBufferedInputBase( uri, mode, kerberos=kerberos, user=user, password=password, headers=headers @@ -253,20 +253,20 @@ def seek(self, offset, whence=0): Returns the position after seeking.""" logger.debug('seeking to offset: %r whence: %r', offset, whence) - if whence not in s3.WHENCE_CHOICES: - raise ValueError('invalid whence, expected one of %r' % s3.WHENCE_CHOICES) + if whence not in constants.WHENCE_CHOICES: + raise ValueError('invalid whence, expected one of %r' % constants.WHENCE_CHOICES) if not self.seekable(): raise OSError - if whence == s3.START: + if whence == constants.WHENCE_START: new_pos = offset - elif whence == s3.CURRENT: + elif whence == constants.WHENCE_CURRENT: new_pos = self._current_pos + offset - elif whence == s3.END: + elif whence == constants.WHENCE_END: new_pos = self.content_length + offset - new_pos = s3.clamp(new_pos, 0, self.content_length) + new_pos = smart_open.utils.clamp(new_pos, 0, self.content_length) if self._current_pos == new_pos: return self._current_pos @@ -302,7 +302,7 @@ def truncate(self, size=None): def _partial_request(self, start_pos=None): if start_pos is not None: - self.headers.update({"range": s3.make_range_string(start_pos)}) + self.headers.update({"range": smart_open.utils.make_range_string(start_pos)}) response = requests.get(self.url, auth=self.auth, stream=True, headers=self.headers) return response diff --git a/smart_open/s3.py b/smart_open/s3.py index 74444d3a..fdd58de3 100644 --- a/smart_open/s3.py +++ b/smart_open/s3.py @@ -23,6 +23,8 @@ import smart_open.bytebuffer import smart_open.utils +from smart_open import constants + logger = logging.getLogger(__name__) # AWS Lambda environments do not support multiprocessing.Queue or multiprocessing.Pool. 
@@ -50,12 +52,6 @@ """Default minimum part size for S3 multipart uploads""" MIN_MIN_PART_SIZE = 5 * 1024 ** 2 """The absolute minimum permitted by Amazon.""" -READ_BINARY = 'rb' -WRITE_BINARY = 'wb' -MODES = (READ_BINARY, WRITE_BINARY) -"""Allowed I/O modes for working with S3.""" - -BINARY_NEWLINE = b'\n' SCHEMES = ("s3", "s3n", 's3u', "s3a") DEFAULT_PORT = 443 @@ -63,11 +59,6 @@ DEFAULT_BUFFER_SIZE = 128 * 1024 -START = 0 -CURRENT = 1 -END = 2 -WHENCE_CHOICES = [START, CURRENT, END] - URI_EXAMPLES = ( 's3://my_bucket/my_key', 's3://my_key:my_secret@my_bucket/my_key', @@ -212,19 +203,6 @@ def _override_endpoint_url(transport_params, url): resource_kwargs.update(endpoint_url=url) -def clamp(value, minval, maxval): - return max(min(value, maxval), minval) - - -def make_range_string(start, stop=None): - # - # https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35 - # - if stop is None: - return 'bytes=%d-' % start - return 'bytes=%d-%d' % (start, stop) - - def open_uri(uri, mode, transport_params): parsed_uri = parse_uri(uri) parsed_uri, transport_params = _consolidate_params(parsed_uri, transport_params) @@ -286,13 +264,13 @@ def open( """ logger.debug('%r', locals()) - if mode not in MODES: - raise NotImplementedError('bad mode: %r expected one of %r' % (mode, MODES)) + if mode not in constants.BINARY_MODES: + raise NotImplementedError('bad mode: %r expected one of %r' % (mode, constants.BINARY_MODES)) - if (mode == WRITE_BINARY) and (version_id is not None): + if (mode == constants.WRITE_BINARY) and (version_id is not None): raise ValueError("version_id must be None when writing") - if mode == READ_BINARY: + if mode == constants.READ_BINARY: fileobj = Reader( bucket_id, key_id, @@ -302,7 +280,7 @@ def open( resource_kwargs=resource_kwargs, object_kwargs=object_kwargs, ) - elif mode == WRITE_BINARY: + elif mode == constants.WRITE_BINARY: if multipart_upload: fileobj = MultipartWriter( bucket_id, @@ -371,7 +349,7 @@ def seek(self, position): def 
_load_body(self): """Build a continuous connection with the remote peer starts from the current postion. """ - range_string = make_range_string(self._position) + range_string = smart_open.utils.make_range_string(self._position) logger.debug('content_length: %r range_string: %r', self._content_length, range_string) if self._position == self._content_length == 0 or self._position == self._content_length: @@ -419,7 +397,7 @@ class Reader(io.BufferedIOBase): Implements the io.BufferedIOBase interface of the standard library.""" def __init__(self, bucket, key, version_id=None, buffer_size=DEFAULT_BUFFER_SIZE, - line_terminator=BINARY_NEWLINE, session=None, resource_kwargs=None, + line_terminator=constants.BINARY_NEWLINE, session=None, resource_kwargs=None, object_kwargs=None): self._buffer_size = buffer_size @@ -537,7 +515,7 @@ def seekable(self): We offer only seek support, and no truncate support.""" return True - def seek(self, offset, whence=START): + def seek(self, offset, whence=constants.WHENCE_START): """Seek to the specified position. :param int offset: The offset in bytes. 
@@ -545,16 +523,16 @@ def seek(self, offset, whence=START): Returns the position after seeking.""" logger.debug('seeking to offset: %r whence: %r', offset, whence) - if whence not in WHENCE_CHOICES: - raise ValueError('invalid whence, expected one of %r' % WHENCE_CHOICES) + if whence not in constants.WHENCE_CHOICES: + raise ValueError('invalid whence, expected one of %r' % constants.WHENCE_CHOICES) - if whence == START: + if whence == constants.WHENCE_START: new_position = offset - elif whence == CURRENT: + elif whence == constants.WHENCE_CURRENT: new_position = self._current_pos + offset else: new_position = self._content_length + offset - new_position = clamp(new_position, 0, self._content_length) + new_position = smart_open.utils.clamp(new_position, 0, self._content_length) self._current_pos = new_position self._raw_reader.seek(new_position) logger.debug('new_position: %r', self._current_pos) diff --git a/smart_open/tests/test_gcs.py b/smart_open/tests/test_gcs.py index 22123cc7..3fcd5a07 100644 --- a/smart_open/tests/test_gcs.py +++ b/smart_open/tests/test_gcs.py @@ -24,6 +24,7 @@ import google.api_core.exceptions import smart_open +import smart_open.constants BUCKET_NAME = 'test-smartopen-{}'.format(uuid.uuid4().hex) BLOB_NAME = 'test-blob' @@ -573,7 +574,7 @@ def test_seek_current(self): fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) self.assertEqual(fin.read(5), b'hello') - seek = fin.seek(1, whence=smart_open.gcs.CURRENT) + seek = fin.seek(1, whence=smart_open.constants.WHENCE_CURRENT) self.assertEqual(seek, 6) self.assertEqual(fin.read(6), u'wořld'.encode('utf-8')) @@ -583,7 +584,7 @@ def test_seek_end(self): put_to_bucket(contents=content) fin = smart_open.gcs.SeekableBufferedInputBase(BUCKET_NAME, BLOB_NAME) - seek = fin.seek(-4, whence=smart_open.gcs.END) + seek = fin.seek(-4, whence=smart_open.constants.WHENCE_END) self.assertEqual(seek, len(content) - 4) self.assertEqual(fin.read(), b'you?') @@ -595,7 +596,7 @@ def 
test_detect_eof(self): fin.read() eof = fin.tell() self.assertEqual(eof, len(content)) - fin.seek(0, whence=smart_open.gcs.END) + fin.seek(0, whence=smart_open.constants.WHENCE_END) self.assertEqual(eof, fin.tell()) def test_read_gzip(self): diff --git a/smart_open/tests/test_http.py b/smart_open/tests/test_http.py index e61e9dea..3527295d 100644 --- a/smart_open/tests/test_http.py +++ b/smart_open/tests/test_http.py @@ -11,6 +11,7 @@ import smart_open.http import smart_open.s3 +import smart_open.constants BYTES = b'i tried so hard and got so far but in the end it doesn\'t even matter' @@ -75,7 +76,7 @@ def test_seek_from_current(self): self.assertEqual(BYTES[10:20], read_bytes) self.assertEqual(reader.tell(), 20) - reader.seek(10, whence=smart_open.s3.CURRENT) + reader.seek(10, whence=smart_open.constants.WHENCE_CURRENT) self.assertEqual(reader.tell(), 30) read_bytes = reader.read(size=10) self.assertEqual(reader.tell(), 40) @@ -86,7 +87,7 @@ def test_seek_from_end(self): responses.add_callback(responses.GET, URL, callback=request_callback) reader = smart_open.http.SeekableBufferedInputBase(URL) - reader.seek(-10, whence=smart_open.s3.END) + reader.seek(-10, whence=smart_open.constants.WHENCE_END) self.assertEqual(reader.tell(), len(BYTES) - 10) read_bytes = reader.read(size=10) self.assertEqual(reader.tell(), len(BYTES)) @@ -144,6 +145,6 @@ def test_https_seek_reverse(self): with smart_open.open(HTTPS_URL, "rb") as fin: read_bytes_1 = fin.read(size=10) - fin.seek(-10, whence=smart_open.s3.CURRENT) + fin.seek(-10, whence=smart_open.constants.WHENCE_CURRENT) read_bytes_2 = fin.read(size=10) self.assertEqual(read_bytes_1, read_bytes_2) diff --git a/smart_open/tests/test_s3.py b/smart_open/tests/test_s3.py index 18701321..10203034 100644 --- a/smart_open/tests/test_s3.py +++ b/smart_open/tests/test_s3.py @@ -180,7 +180,7 @@ def test_seek_current(self): fin = smart_open.s3.SeekableBufferedInputBase(BUCKET_NAME, KEY_NAME) self.assertEqual(fin.read(5), b'hello') - seek 
= fin.seek(1, whence=smart_open.s3.CURRENT) + seek = fin.seek(1, whence=smart_open.constants.WHENCE_CURRENT) self.assertEqual(seek, 6) self.assertEqual(fin.read(6), u'wořld'.encode('utf-8')) @@ -190,7 +190,7 @@ def test_seek_end(self): put_to_bucket(contents=content) fin = smart_open.s3.SeekableBufferedInputBase(BUCKET_NAME, KEY_NAME) - seek = fin.seek(-4, whence=smart_open.s3.END) + seek = fin.seek(-4, whence=smart_open.constants.WHENCE_END) self.assertEqual(seek, len(content) - 4) self.assertEqual(fin.read(), b'you?') @@ -202,7 +202,7 @@ def test_detect_eof(self): fin.read() eof = fin.tell() self.assertEqual(eof, len(content)) - fin.seek(0, whence=smart_open.s3.END) + fin.seek(0, whence=smart_open.constants.WHENCE_END) self.assertEqual(eof, fin.tell()) def test_read_gzip(self): @@ -522,13 +522,6 @@ def test_flush_close(self): fout.close() -class ClampTest(unittest.TestCase): - def test(self): - self.assertEqual(smart_open.s3.clamp(5, 0, 10), 5) - self.assertEqual(smart_open.s3.clamp(11, 0, 10), 10) - self.assertEqual(smart_open.s3.clamp(-1, 0, 10), 0) - - ARBITRARY_CLIENT_ERROR = botocore.client.ClientError(error_response={}, operation_name='bar') diff --git a/smart_open/tests/test_utils.py b/smart_open/tests/test_utils.py new file mode 100644 index 00000000..63b463d8 --- /dev/null +++ b/smart_open/tests/test_utils.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2019 Radim Rehurek +# +# This code is distributed under the terms and conditions +# from the MIT License (MIT). 
+# + +import unittest + +import smart_open.utils + + +class ClampTest(unittest.TestCase): + def test_low(self): + self.assertEqual(smart_open.utils.clamp(5, 0, 10), 5) + + def test_high(self): + self.assertEqual(smart_open.utils.clamp(11, 0, 10), 10) + + def test_out_of_range(self): + self.assertEqual(smart_open.utils.clamp(-1, 0, 10), 0) diff --git a/smart_open/utils.py b/smart_open/utils.py index 5f01d164..dfd44ecb 100644 --- a/smart_open/utils.py +++ b/smart_open/utils.py @@ -68,3 +68,51 @@ def check_kwargs(kallable, kwargs): logger.warning('ignoring unsupported keyword arguments: %r', unsupported_keywords) return supported_kwargs + + +def clamp(value, minval, maxval): + """Clamp a numeric value to a specific range. + + Parameters + ---------- + value: numeric + The value to clamp. + + minval: numeric + The lower bound. + + maxval: numeric + The upper bound. + + Returns + ------- + numeric + The clamped value. It will be in the range ``[minval, maxval]``. + + """ + return max(min(value, maxval), minval) + + +def make_range_string(start, stop=None): + """Create a byte range specifier in accordance with RFC-2616. + + Parameters + ---------- + start: int + The start of the byte range + + stop: int, optional + The end of the byte range. If unspecified, indicates EOF. + + Returns + ------- + str + A byte range specifier. 
+ + """ + # + # https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35 + # + if stop is None: + return 'bytes=%d-' % start + return 'bytes=%d-%d' % (start, stop) diff --git a/smart_open/webhdfs.py b/smart_open/webhdfs.py index cfdac5f9..d9785ff0 100644 --- a/smart_open/webhdfs.py +++ b/smart_open/webhdfs.py @@ -18,7 +18,7 @@ import requests -from smart_open import utils +from smart_open import utils, constants import http.client as httplib @@ -55,9 +55,9 @@ def open(http_uri, mode, min_part_size=MIN_PART_SIZE): if http_uri.startswith(SCHEME): http_uri = _convert_to_http_uri(http_uri) - if mode == 'rb': + if mode == constants.READ_BINARY: fobj = BufferedInputBase(http_uri) - elif mode == 'wb': + elif mode == constants.WRITE_BINARY: fobj = BufferedOutputBase(http_uri, min_part_size=min_part_size) else: raise NotImplementedError("webhdfs support for mode %r not implemented" % mode) From 6d7a73a811473d32e03e50a280929794ebb433ac Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sat, 28 Mar 2020 16:16:47 +0900 Subject: [PATCH 31/32] split out concurrency submodule --- smart_open/concurrency.py | 76 +++++++++++++++++++++++++++++++++++++ smart_open/gcs.py | 1 - smart_open/s3.py | 64 +------------------------------ smart_open/tests/test_s3.py | 28 +++++++------- 4 files changed, 92 insertions(+), 77 deletions(-) create mode 100644 smart_open/concurrency.py diff --git a/smart_open/concurrency.py b/smart_open/concurrency.py new file mode 100644 index 00000000..4e72aec7 --- /dev/null +++ b/smart_open/concurrency.py @@ -0,0 +1,76 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2020 Radim Rehurek +# +# This code is distributed under the terms and conditions +# from the MIT License (MIT). +# + +"""Common functionality for concurrent processing. + +The main entry point is :func:`create_pool`. 
+""" + +import contextlib +import logging +import warnings + +logger = logging.getLogger(__name__) + +# AWS Lambda environments do not support multiprocessing.Queue or multiprocessing.Pool. +# However they do support Threads and therefore concurrent.futures's ThreadPoolExecutor. +# We use this flag to allow python 2 backward compatibility, where concurrent.futures doesn't exist. +_CONCURRENT_FUTURES = False +try: + import concurrent.futures + _CONCURRENT_FUTURES = True +except ImportError: + warnings.warn("concurrent.futures could not be imported and won't be used") + +# Multiprocessing is unavailable in App Engine (and possibly other sandboxes). +# The only method currently relying on it is iter_bucket, which is instructed +# whether to use it by the MULTIPROCESSING flag. +_MULTIPROCESSING = False +try: + import multiprocessing.pool + _MULTIPROCESSING = True +except ImportError: + warnings.warn("multiprocessing could not be imported and won't be used") + + +class DummyPool(object): + """A class that mimics multiprocessing.pool.Pool for our purposes.""" + def imap_unordered(self, function, items): + return map(function, items) + + def terminate(self): + pass + + +class ConcurrentFuturesPool(object): + """A class that mimics multiprocessing.pool.Pool but uses concurrent futures instead of processes.""" + def __init__(self, max_workers): + self.executor = concurrent.futures.ThreadPoolExecutor(max_workers) + + def imap_unordered(self, function, items): + futures = [self.executor.submit(function, item) for item in items] + for future in concurrent.futures.as_completed(futures): + yield future.result() + + def terminate(self): + self.executor.shutdown(wait=True) + + +@contextlib.contextmanager +def create_pool(processes=1): + if _MULTIPROCESSING and processes: + logger.info("creating multiprocessing pool with %i workers", processes) + pool = multiprocessing.pool.Pool(processes=processes) + elif _CONCURRENT_FUTURES and processes: + logger.info("creating concurrent 
futures pool with %i workers", processes) + pool = ConcurrentFuturesPool(max_workers=processes) + else: + logger.info("creating dummy pool") + pool = DummyPool() + yield pool + pool.terminate() diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 2aeb4e12..9583fc7c 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -16,7 +16,6 @@ import google.auth.transport.requests as google_requests import smart_open.bytebuffer -import smart_open.s3 import smart_open.utils from smart_open import constants diff --git a/smart_open/s3.py b/smart_open/s3.py index fdd58de3..faa6ed20 100644 --- a/smart_open/s3.py +++ b/smart_open/s3.py @@ -8,12 +8,10 @@ """Implements file-like objects for reading and writing from/to S3.""" import io -import contextlib import functools import logging import time import urllib.parse -import warnings import boto import boto3 @@ -21,33 +19,13 @@ import botocore.exceptions import smart_open.bytebuffer +import smart_open.concurrency import smart_open.utils from smart_open import constants logger = logging.getLogger(__name__) -# AWS Lambda environments do not support multiprocessing.Queue or multiprocessing.Pool. -# However they do support Threads and therefore concurrent.futures's ThreadPoolExecutor. -# We use this flag to allow python 2 backward compatibility, where concurrent.futures doesn't exist. -_CONCURRENT_FUTURES = False -try: - import concurrent.futures - _CONCURRENT_FUTURES = True -except ImportError: - warnings.warn("concurrent.futures could not be imported and won't be used") - -# Multiprocessing is unavailable in App Engine (and possibly other sandboxes). -# The only method currently relying on it is iter_bucket, which is instructed -# whether to use it by the MULTIPROCESSING flag. 
-_MULTIPROCESSING = False -try: - import multiprocessing.pool - _MULTIPROCESSING = True -except ImportError: - warnings.warn("multiprocessing could not be imported and won't be used") - - DEFAULT_MIN_PART_SIZE = 50 * 1024**2 """Default minimum part size for S3 multipart uploads""" MIN_MIN_PART_SIZE = 5 * 1024 ** 2 @@ -1042,7 +1020,7 @@ def iter_bucket( retries=retries, **session_kwargs) - with _create_process_pool(processes=workers) as pool: + with smart_open.concurrency.create_pool(processes=workers) as pool: result_iterator = pool.imap_unordered(download_key, key_iterator) for key_no, (key, content) in enumerate(result_iterator): if True or key_no % 1000 == 0: @@ -1125,41 +1103,3 @@ def _download_fileobj(bucket, key_name): buf = io.BytesIO() bucket.download_fileobj(key_name, buf) return buf.getvalue() - - -class DummyPool(object): - """A class that mimics multiprocessing.pool.Pool for our purposes.""" - def imap_unordered(self, function, items): - return map(function, items) - - def terminate(self): - pass - - -class ConcurrentFuturesPool(object): - """A class that mimics multiprocessing.pool.Pool but uses concurrent futures instead of processes.""" - def __init__(self, max_workers): - self.executor = concurrent.futures.ThreadPoolExecutor(max_workers) - - def imap_unordered(self, function, items): - futures = [self.executor.submit(function, item) for item in items] - for future in concurrent.futures.as_completed(futures): - yield future.result() - - def terminate(self): - self.executor.shutdown(wait=True) - - -@contextlib.contextmanager -def _create_process_pool(processes=1): - if _MULTIPROCESSING and processes: - logger.info("creating multiprocessing pool with %i workers", processes) - pool = multiprocessing.pool.Pool(processes=processes) - elif _CONCURRENT_FUTURES and processes: - logger.info("creating concurrent futures pool with %i workers", processes) - pool = ConcurrentFuturesPool(max_workers=processes) - else: - logger.info("creating dummy pool") - pool = 
DummyPool() - yield pool - pool.terminate() diff --git a/smart_open/tests/test_s3.py b/smart_open/tests/test_s3.py index 10203034..223098a1 100644 --- a/smart_open/tests/test_s3.py +++ b/smart_open/tests/test_s3.py @@ -608,15 +608,15 @@ def test_old(self): @moto.mock_s3 -@unittest.skipIf(not smart_open.s3._CONCURRENT_FUTURES, 'concurrent.futures unavailable') +@unittest.skipIf(not smart_open.concurrency._CONCURRENT_FUTURES, 'concurrent.futures unavailable') class IterBucketConcurrentFuturesTest(unittest.TestCase): def setUp(self): - self.old_flag_multi = smart_open.s3._MULTIPROCESSING - smart_open.s3._MULTIPROCESSING = False + self.old_flag_multi = smart_open.concurrency._MULTIPROCESSING + smart_open.concurrency._MULTIPROCESSING = False ignore_resource_warnings() def tearDown(self): - smart_open.s3._MULTIPROCESSING = self.old_flag_multi + smart_open.concurrency._MULTIPROCESSING = self.old_flag_multi cleanup_bucket() def test(self): @@ -630,15 +630,15 @@ def test(self): @moto.mock_s3 -@unittest.skipIf(not smart_open.s3._MULTIPROCESSING, 'multiprocessing unavailable') +@unittest.skipIf(not smart_open.concurrency._MULTIPROCESSING, 'multiprocessing unavailable') class IterBucketMultiprocessingTest(unittest.TestCase): def setUp(self): - self.old_flag_concurrent = smart_open.s3._CONCURRENT_FUTURES - smart_open.s3._CONCURRENT_FUTURES = False + self.old_flag_concurrent = smart_open.concurrency._CONCURRENT_FUTURES + smart_open.concurrency._CONCURRENT_FUTURES = False ignore_resource_warnings() def tearDown(self): - smart_open.s3._CONCURRENT_FUTURES = self.old_flag_concurrent + smart_open.concurrency._CONCURRENT_FUTURES = self.old_flag_concurrent cleanup_bucket() def test(self): @@ -654,16 +654,16 @@ def test(self): @moto.mock_s3 class IterBucketSingleProcessTest(unittest.TestCase): def setUp(self): - self.old_flag_multi = smart_open.s3._MULTIPROCESSING - self.old_flag_concurrent = smart_open.s3._CONCURRENT_FUTURES - smart_open.s3._MULTIPROCESSING = False - 
smart_open.s3._CONCURRENT_FUTURES = False + self.old_flag_multi = smart_open.concurrency._MULTIPROCESSING + self.old_flag_concurrent = smart_open.concurrency._CONCURRENT_FUTURES + smart_open.concurrency._MULTIPROCESSING = False + smart_open.concurrency._CONCURRENT_FUTURES = False ignore_resource_warnings() def tearDown(self): - smart_open.s3._MULTIPROCESSING = self.old_flag_multi - smart_open.s3._CONCURRENT_FUTURES = self.old_flag_concurrent + smart_open.concurrency._MULTIPROCESSING = self.old_flag_multi + smart_open.concurrency._CONCURRENT_FUTURES = self.old_flag_concurrent cleanup_bucket() def test(self): From f7a4df0b8b17b923c11fe378c4bdefc69383e908 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 29 Mar 2020 16:38:31 +0900 Subject: [PATCH 32/32] update extending.md --- extending.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/extending.md b/extending.md index f0c7d08d..b205b2b7 100644 --- a/extending.md +++ b/extending.md @@ -130,3 +130,13 @@ register_compressor('.xz', _handle_xz) There are many compression formats out there, and supporting all of them is beyond the scope of `smart_open`. We want our code's functionality to cover the bare minimum required to satisfy 80% of our users. We leave the remaining 20% of users with the ability to deal with compression in their own code, using the trivial mechanism described above. + +Documentation +------------- + +Once you've contributed your extension, please add it to the documentation so that it is discoverable for other users. +Some notable files: + +- setup.py: See the `description` keyword. Not all contributions will affect this. +- README.rst +- howto.md (if your extension solves a specific problem that doesn't get covered by other documentation)