Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for pathlib.Path. Fix #170 #175

Merged
merged 9 commits into from
Mar 22, 2018
16 changes: 16 additions & 0 deletions smart_open/smart_open_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,20 @@
import subprocess
import sys
import requests
import importlib
import io
import warnings

# Import ``pathlib`` if the builtin ``pathlib`` or the backport ``pathlib2`` are
# available. The builtin ``pathlib`` will be imported with higher precedence.
for pathlib_module in ('pathlib', 'pathlib2'):
try:
pathlib = importlib.import_module(pathlib_module)
PATHLIB_SUPPORT = True
break
except ImportError:
PATHLIB_SUPPORT = False

from boto.compat import BytesIO, urlsplit, six
import boto.s3.connection
import boto.s3.key
Expand Down Expand Up @@ -102,6 +113,7 @@ def smart_open(uri, mode="rb", **kw):
3. a URI for Amazon's S3 (can also supply credentials inside the URI):
`s3://my_bucket/lines.txt`, `s3://my_aws_key_id:key_secret@my_bucket/lines.txt`
4. an instance of the boto.s3.key.Key class.
5. an instance of the pathlib.Path class.

Examples::

Expand Down Expand Up @@ -163,6 +175,10 @@ def smart_open(uri, mode="rb", **kw):
if not isinstance(mode, six.string_types):
raise TypeError('mode should be a string')

# Support opening ``pathlib.Path`` objects by casting them to strings.
if PATHLIB_SUPPORT and isinstance(uri, pathlib.Path):
uri = str(uri)

if isinstance(uri, six.string_types):
# this method just routes the request to classes handling the specific storage
# schemes, depending on the URI protocol in `uri`
Expand Down
32 changes: 32 additions & 0 deletions smart_open/tests/test_smart_open.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@
# This code is distributed under the terms and conditions
# from the MIT License (MIT).

import importlib
import unittest
import logging
import tempfile
import os
import pkgutil
import sys
import hashlib

Expand Down Expand Up @@ -191,6 +193,36 @@ def test_open_with_keywords_explicit_r(self):
actual = fin.read()
self.assertEqual(expected, actual)

@unittest.skipIf(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think a simpler way of writing this is:

@unittest.skipIf(smart_open_lib.PATHLIB_SUPPORT is False, "your reason here")

(pkgutil.find_loader('pathlib') is None and
pkgutil.find_loader('pathlib2') is None),
"do not test pathlib support if pathlib or backport are not available")
def test_open_and_read_pathlib_path(self):
"""If ``pathlib.Path`` is available we should be able to open and read."""
fpath = os.path.join(CURR_DIR, 'test_data/cp852.tsv.txt')

# Import ``pathlib`` if the builtin ``pathlib`` or the backport
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we're duplicating unnecessary work here. We've already gone through the pain of importing the necessary pathlib module in smart_open_lib. So, we can just get rid of all this import stuff, and use:

from smart_open.smart_open_lib import pathlib
path = pathlib.Path('/foo/bar')

whenever you need access to the actual pathlib module.

# ``pathlib2`` are available. The builtin ``pathlib`` will be imported
# with higher precedence.
for pathlib_module in ('pathlib', 'pathlib2'):
try:
pathlib = importlib.import_module(pathlib_module)
break
# Unit test will skip if either module is unavailable so it's safe
# to assume we can import _at least_ one working ``pathlib``.
except ImportError:
pass

# builtin open() supports pathlib.Path in python>=3.6 only
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we need to bother with this is in the tests. From the point of view of the test, it's an unnecessary detail. In this particular case, we just want to read the expected data in the quickest and simplest possible way. So in this case, I think it's better to just do:

with open(fpath) as fin:
    expected = fin.read().decode('cp852')

path_open = pathlib.Path(fpath) if sys.version_info >= (3, 6) else fpath
path_smart_open = pathlib.Path(fpath)

with open(path_open, 'rb') as fin:
expected = fin.read().decode('cp852')
with smart_open.smart_open(path_smart_open, encoding='cp852') as fin:
actual = fin.read()
self.assertEqual(expected, actual)

@mock_s3
def test_read_never_returns_none(self):
"""read should never return None."""
Expand Down