diff --git a/help.txt b/help.txt index f8c3a75c..9bc8ca43 100644 --- a/help.txt +++ b/help.txt @@ -13,19 +13,26 @@ DESCRIPTION The main functions are: * `open()`, which opens the given file for reading/writing + * `parse_uri()` * `s3_iter_bucket()`, which goes over all keys in an S3 bucket in parallel * `register_compressor()`, which registers callbacks for transparent compressor handling PACKAGE CONTENTS bytebuffer + compression + concurrency + constants doctools gcs hdfs http + local_file s3 smart_open_lib ssh tests (package) + transport + utils version webhdfs @@ -33,27 +40,14 @@ FUNCTIONS open(uri, mode='r', buffering=-1, encoding=None, errors=None, newline=None, closefd=True, opener=None, ignore_ext=False, transport_params=None) Open the URI object, returning a file-like object. - The URI is usually a string in a variety of formats: - - 1. a URI for the local filesystem: `./lines.txt`, `/home/joe/lines.txt.gz`, - `file:///home/joe/lines.txt.bz2` - 2. a URI for HDFS: `hdfs:///some/path/lines.txt` - 3. a URI for Amazon's S3 (can also supply credentials inside the URI): - `s3://my_bucket/lines.txt`, `s3://my_aws_key_id:key_secret@my_bucket/lines.txt` + The URI is usually a string in a variety of formats. + For a full list of examples, see the :func:`parse_uri` function. The URI may also be one of: - an instance of the pathlib.Path class - a stream (anything that implements io.IOBase-like functionality) - This function supports transparent compression and decompression using the - following codec: - - - ``.gz`` - - ``.bz2`` - - The function depends on the file extension to determine the appropriate codec. - Parameters ---------- uri: str or object @@ -89,7 +83,45 @@ FUNCTIONS by the transport layer being used, smart_open will ignore that argument and log a warning message. 
- S3 (for details, see :mod:`smart_open.s3` and :func:`smart_open.s3.open`): + smart_open supports the following transport mechanisms: + + file (smart_open/local_file.py) + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Implements the transport for the file:// schema. + + gs (smart_open/gcs.py) + ~~~~~~~~~~~~~~~~~~~~~~ + Implements file-like objects for reading and writing to/from GCS. + + buffer_size: int, optional + The buffer size to use when performing I/O. For reading only. + min_part_size: int, optional + The minimum part size for multipart uploads. For writing only. + client: google.cloud.storage.Client, optional + The GCS client to use when working with google-cloud-storage. + + hdfs (smart_open/hdfs.py) + ~~~~~~~~~~~~~~~~~~~~~~~~~ + Implements reading and writing to/from HDFS. + + http (smart_open/http.py) + ~~~~~~~~~~~~~~~~~~~~~~~~~ + Implements file-like objects for reading from http. + + kerberos: boolean, optional + If True, will attempt to use the local Kerberos credentials + user: str, optional + The username for authenticating over HTTP + password: str, optional + The password for authenticating over HTTP + headers: dict, optional + Any headers to send in the request. If ``None``, the default headers are sent: + ``{'Accept-Encoding': 'identity'}``. To use no headers at all, + set this variable to an empty dict, ``{}``. + + s3 (smart_open/s3.py) + ~~~~~~~~~~~~~~~~~~~~~ + Implements file-like objects for reading and writing from/to AWS S3. buffer_size: int, optional The buffer size to use when performing I/O. @@ -119,25 +151,9 @@ FUNCTIONS Additional parameters to pass to boto3's object.get function. Used during reading only. 
- HTTP (for details, see :mod:`smart_open.http` and :func:`smart_open.http.open`): - - kerberos: boolean, optional - If True, will attempt to use the local Kerberos credentials - user: str, optional - The username for authenticating over HTTP - password: str, optional - The password for authenticating over HTTP - headers: dict, optional - Any headers to send in the request. If ``None``, the default headers are sent: - ``{'Accept-Encoding': 'identity'}``. To use no headers at all, - set this variable to an empty dict, ``{}``. - - WebHDFS (for details, see :mod:`smart_open.webhdfs` and :func:`smart_open.webhdfs.open`): - - min_part_size: int, optional - For writing only. - - SSH (for details, see :mod:`smart_open.ssh` and :func:`smart_open.ssh.open`): + scp (smart_open/ssh.py) + ~~~~~~~~~~~~~~~~~~~~~~~ + Implements I/O streams over SSH. mode: str, optional The mode to use for opening the file. @@ -153,9 +169,16 @@ FUNCTIONS transport_params: dict, optional Any additional settings to be passed to paramiko.SSHClient.connect + webhdfs (smart_open/webhdfs.py) + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Implements reading and writing to/from WebHDFS. + + min_part_size: int, optional + For writing only. Examples -------- + >>> from smart_open import open >>> >>> # stream lines from an S3 object @@ -192,25 +215,14 @@ FUNCTIONS >>> for line in open('http://example.com/index.html'): ... print(repr(line)) ... 
break - '\n' - - Other examples of URLs that ``smart_open`` accepts:: - - s3://my_bucket/my_key - s3://my_key:my_secret@my_bucket/my_key - s3://my_key:my_secret@my_server:my_port@my_bucket/my_key - gs://my_bucket/my_blob - hdfs:///path/file - hdfs://path/file - webhdfs://host:port/path/file - ./local/path/file - ~/local/path/file - local/path/file - ./local/path/file.gz - file:///home/user/file - file:///home/user/file.bz2 - [ssh|scp|sftp]://username@host//path/file - [ssh|scp|sftp]://username@host/path/file + + This function also supports transparent compression and decompression + using the following codecs: + + * .bz2 + * .gz + + The function depends on the file extension to determine the appropriate codec. See Also @@ -219,20 +231,66 @@ FUNCTIONS - `smart_open README.rst <https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst>`__ + parse_uri(uri_as_string) + Parse the given URI from a string. + + Parameters + ---------- + uri_as_string: str + The URI to parse. + + Returns + ------- + collections.namedtuple + The parsed URI. + + Notes + ----- + Supported URI schemes are: + + * file + * gs + * hdfs + * http + * s3 + * scp + * webhdfs + + Valid URI examples:: + + * ./local/path/file + * ~/local/path/file + * local/path/file + * ./local/path/file.gz + * file:///home/user/file + * file:///home/user/file.bz2 + * hdfs:///path/file + * hdfs://path/file + * s3://my_bucket/my_key + * s3://my_key:my_secret@my_bucket/my_key + * s3://my_key:my_secret@my_server:my_port@my_bucket/my_key + * ssh://username@host/path/file + * ssh://username@host//path/file + * scp://username@host/path/file + * sftp://username@host/path/file + * webhdfs://host:port/path/file + register_compressor(ext, callback) Register a callback for transparently decompressing files with a specific extension. Parameters ---------- ext: str - The extension. + The extension. Must include the leading period, e.g. ``.gz``. callback: callable The callback. It must accept two position arguments, file_obj and mode. 
+ This function will be called when ``smart_open`` is opening a file with + the specified extension. Examples -------- - Instruct smart_open to use the identity function whenever opening a file + Instruct smart_open to use the `lzma` module whenever opening a file with a .xz extension (see README.rst for the complete example showing I/O): >>> def _handle_xz(file_obj, mode): @@ -295,12 +353,12 @@ FUNCTIONS smart_open(uri, mode='rb', **kw) DATA - __all__ = ['open', 'smart_open', 's3_iter_bucket', 'register_compresso... + __all__ = ['open', 'parse_uri', 'register_compressor', 's3_iter_bucket... VERSION 1.10.0 FILE - /home/misha/git/smart_open/smart_open/__init__.py + /Users/misha/git/smart_open/smart_open/__init__.py diff --git a/smart_open/doctools.py b/smart_open/doctools.py index bf9c3c7e..0498af57 100644 --- a/smart_open/doctools.py +++ b/smart_open/doctools.py @@ -11,15 +11,17 @@ For internal use only. """ +import contextlib import inspect import io import os.path import re -import warnings from . import compression from . import transport +PLACEHOLDER = ' smart_open/doctools.py magic goes here' + def extract_kwargs(docstring): """Extract keyword argument documentation from a function's docstring. 
@@ -166,54 +168,78 @@ def extract_examples_from_readme_rst(indent=' '): return indent + 'See README.rst' -def tweak_docstrings(open_function, parse_uri_function): +def tweak_open_docstring(f): + buf = io.StringIO() + seen = set() + + root_path = os.path.dirname(os.path.dirname(__file__)) + + with contextlib.redirect_stdout(buf): + print(' smart_open supports the following transport mechanisms:') + print() + for scheme, submodule in sorted(transport._REGISTRY.items()): + if scheme == transport.NO_SCHEME or submodule in seen: + continue + seen.add(submodule) + + relpath = os.path.relpath(submodule.__file__, start=root_path) + heading = '%s (%s)' % (scheme, relpath) + print(' %s' % heading) + print(' %s' % ('~' * len(heading))) + print(' %s' % submodule.__doc__.split('\n')[0]) + print() + + kwargs = extract_kwargs(submodule.open.__doc__) + if kwargs: + print(to_docstring(kwargs, lpad=u' ')) + + print(' Examples') + print(' --------') + print() + print(extract_examples_from_readme_rst()) + + print(' This function also supports transparent compression and decompression ') + print(' using the following codecs:') + print() + for extension in compression.get_supported_extensions(): + print(' * %s' % extension) + print() + print(' The function depends on the file extension to determine the appropriate codec.') + # # The docstring can be None if -OO was passed to the interpreter. 
# - if not (open_function.__doc__ and parse_uri_function.__doc__): - warnings.warn( - 'docstrings for smart_open function are missing, ' - 'see https://github.com/RaRe-Technologies/smart_open' - '/blob/master/README.rst if you need documentation' - ) - return - - substrings = {} - schemes = io.StringIO() - seen_examples = set() - uri_examples = io.StringIO() + if f.__doc__: + f.__doc__ = f.__doc__.replace(PLACEHOLDER, buf.getvalue()) - for scheme, submodule in sorted(transport._REGISTRY.items()): - if scheme == transport.NO_SCHEME: - continue - schemes.write(' * %s\n' % scheme) +def tweak_parse_uri_docstring(f): + buf = io.StringIO() + seen = set() + schemes = [] + examples = [] - try: - fn = submodule.open - except AttributeError: - substrings[scheme] = '' - else: - kwargs = extract_kwargs(fn.__doc__) - substrings[scheme] = to_docstring(kwargs, lpad=u' ') + for scheme, submodule in sorted(transport._REGISTRY.items()): + if scheme == transport.NO_SCHEME or submodule in seen: + continue + schemes.append(scheme) + seen.add(submodule) try: - examples = submodule.URI_EXAMPLES + examples.extend(submodule.URI_EXAMPLES) except AttributeError: - continue - else: - for e in examples: - if e not in seen_examples: - uri_examples.write(' * %s\n' % e) - seen_examples.add(e) - - substrings['codecs'] = '\n'.join( - [' * %s' % e for e in compression.get_supported_extensions()] - ) - substrings['examples'] = extract_examples_from_readme_rst() - - open_function.__doc__ = open_function.__doc__ % substrings - parse_uri_function.__doc__ = parse_uri_function.__doc__ % dict( - schemes=schemes.getvalue(), - uri_examples=uri_examples.getvalue(), - ) + pass + + with contextlib.redirect_stdout(buf): + print(' Supported URI schemes are:') + print() + for scheme in schemes: + print(' * %s' % scheme) + print() + print(' Valid URI examples::') + print() + for example in examples: + print(' * %s' % example) + + if f.__doc__: + f.__doc__ = f.__doc__.replace(PLACEHOLDER, buf.getvalue()) diff --git 
a/smart_open/s3.py b/smart_open/s3.py index a99bcc55..c583fffa 100644 --- a/smart_open/s3.py +++ b/smart_open/s3.py @@ -5,7 +5,7 @@ # This code is distributed under the terms and conditions # from the MIT License (MIT). # -"""Implements file-like objects for reading and writing from/to S3.""" +"""Implements file-like objects for reading and writing from/to AWS S3.""" import io import functools diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index 7b2a7bd6..02f3b18f 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -84,17 +84,7 @@ def parse_uri(uri_as_string): Notes ----- - - Supported URI schemes are: - -%(schemes)s - s3, s3a and s3n are treated the same way. s3u is s3 but without SSL. - - Valid URI examples:: - -%(uri_examples)s - - + smart_open/doctools.py magic goes here """ scheme = _sniff_scheme(uri_as_string) submodule = transport.get_transport(scheme) @@ -138,13 +128,6 @@ def open( - an instance of the pathlib.Path class - a stream (anything that implements io.IOBase-like functionality) - This function supports transparent compression and decompression using the - following codecs: - -%(codecs)s - - The function depends on the file extension to determine the appropriate codec. - Parameters ---------- uri: str or object @@ -180,22 +163,7 @@ def open( by the transport layer being used, smart_open will ignore that argument and log a warning message. 
- S3 (for details, see :mod:`smart_open.s3` and :func:`smart_open.s3.open`): - -%(s3)s - HTTP (for details, see :mod:`smart_open.http` and :func:`smart_open.http.open`): - -%(http)s - WebHDFS (for details, see :mod:`smart_open.webhdfs` and :func:`smart_open.webhdfs.open`): - -%(webhdfs)s - SSH (for details, see :mod:`smart_open.ssh` and :func:`smart_open.ssh.open`): - -%(ssh)s - - Examples - -------- -%(examples)s + smart_open/doctools.py magic goes here See Also -------- @@ -493,4 +461,19 @@ def _patch_pathlib(func): return old_impl -doctools.tweak_docstrings(open, parse_uri) +# +# Prevent failures with doctools from messing up the entire library. We don't +# expect such failures, but contributed modules (e.g. new transport mechanisms) +# may not be as polished. +# +try: + doctools.tweak_open_docstring(open) + doctools.tweak_parse_uri_docstring(parse_uri) +except Exception as ex: + logger.error( + 'Encountered a non-fatal error while building docstrings (see below). ' + 'help(smart_open) will provide incomplete information as a result. ' + 'For full help text, see ' + '<https://github.com/RaRe-Technologies/smart_open/blob/master/help.txt>.' + ) + logger.exception(ex)