Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for .xz / lzma #262

Merged
merged 4 commits into from
Feb 23, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ What?

``smart_open`` is a Python 2 & Python 3 library for **efficient streaming of very large files** from/to S3, HDFS, WebHDFS, HTTP, or local (compressed) files. It's a drop-in replacement for Python's built-in ``open()``: it can do anything ``open`` can (100% compatible, falls back to native ``open`` wherever possible), plus lots of nifty extra stuff on top.

``smart_open`` is well-tested, well-documented and sports a simple, Pythonic API:
``smart_open`` is well-tested, well-documented, and has a simple, Pythonic API:

.. code-block:: python

Expand Down Expand Up @@ -100,6 +100,11 @@ Or, if you prefer to install from the `source tar.gz <http://pypi.python.org/pyp

To run the unit tests (optional), you'll also need to install `mock <https://pypi.python.org/pypi/mock>`_ , `moto <https://github.com/spulec/moto>`_ and `responses <https://github.com/getsentry/responses>`_ (``pip install mock moto responses``). The tests are also run automatically with `Travis CI <https://travis-ci.org/RaRe-Technologies/smart_open>`_ on every commit push & pull request.

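Supported compression formats
-----------------------------
``smart_open`` can read and write gzip, bzip2 and xz files. The compression format is determined
from the file extension (``.gz``, ``.bz2`` or ``.xz``). Compressed files are handled transparently
over HTTP, S3, and other protocols, too.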

S3-Specific Options
-------------------

Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ def read(fname):
'boto >= 2.32',
'bz2file',
'requests',
'boto3'
'boto3',
'backports.lzma;python_version<"3.3"',
],
tests_require=tests_require,
extras_require={
Expand Down
23 changes: 18 additions & 5 deletions smart_open/smart_open_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,14 @@
from bz2 import BZ2File

import gzip
try:
import lzma
except ImportError:
# py<3.3
from backports import lzma


COMPRESSED_EXT = ('.gz', '.bz2', '.xz') # supported compressed file extensions

#
# This module defines a function called smart_open so we cannot use
Expand Down Expand Up @@ -121,8 +129,8 @@ def smart_open(uri, mode="rb", **kw):

The `uri` can be either:

1. a URI for the local filesystem (compressed ``.gz`` or ``.bz2`` files handled automatically):
`./lines.txt`, `/home/joe/lines.txt.gz`, `file:///home/joe/lines.txt.bz2`
1. a URI for the local filesystem (compressed ``.gz``, ``.bz2`` or ``.xz`` files handled
automatically): `./lines.txt`, `/home/joe/lines.txt.gz`, `file:///home/joe/lines.txt.bz2`
2. a URI for HDFS: `hdfs:///some/path/lines.txt`
3. a URI for Amazon's S3 (can also supply credentials inside the URI):
`s3://my_bucket/lines.txt`, `s3://my_aws_key_id:key_secret@my_bucket/lines.txt`
Expand Down Expand Up @@ -165,6 +173,8 @@ def smart_open(uri, mode="rb", **kw):
... fout.write("hello world!\n")
>>> with smart_open.smart_open('/home/radim/another.txt.bz2', 'wb') as fout:
... fout.write("good bye!\n")
>>> with smart_open.smart_open('/home/radim/another.txt.xz', 'wb') as fout:
... fout.write("never say never!\n")
>>> # stream from/to (compressed) local files, expanding ~ and ~user constructions:
>>> for line in smart_open.smart_open('~/my_file.txt'):
... print line
Expand Down Expand Up @@ -270,7 +280,7 @@ def _shortcut_open(uri, mode, **kw):

_, extension = P.splitext(parsed_uri.uri_path)
ignore_extension = kw.get('ignore_extension', False)
if extension in ('.gz', '.bz2') and not ignore_extension:
if extension in COMPRESSED_EXT and not ignore_extension:
return None

#
Expand Down Expand Up @@ -330,7 +340,7 @@ def _open_binary_stream(uri, mode, **kw):

if parsed_uri.scheme in ("file", ):
# local files -- both read & write supported
# compression, if any, is determined by the filename extension (.gz, .bz2)
# compression, if any, is determined by the filename extension (.gz, .bz2, .xz)
fobj = io.open(parsed_uri.uri_path, mode)
return fobj, filename
elif parsed_uri.scheme in smart_open_s3.SUPPORTED_SCHEMES:
Expand Down Expand Up @@ -431,6 +441,7 @@ def _parse_uri(uri_as_string):
* ./local/path/file.gz
* file:///home/user/file
* file:///home/user/file.bz2
* file:///home/user/file.xz
"""
if os.name == 'nt':
# urlsplit doesn't work on Windows -- it parses the drive as the scheme...
Expand Down Expand Up @@ -552,7 +563,7 @@ def _need_to_buffer(file_obj, mode, ext):
# .seekable, but have a .seek method instead.
#
is_seekable = hasattr(file_obj, 'seek')
return six.PY2 and mode.startswith('r') and ext in ('.gz', '.bz2') and not is_seekable
return six.PY2 and mode.startswith('r') and ext in COMPRESSED_EXT and not is_seekable


def _compression_wrapper(file_obj, filename, mode):
Expand All @@ -576,6 +587,8 @@ def _compression_wrapper(file_obj, filename, mode):
return BZ2File(file_obj, mode)
elif ext == '.gz':
return gzip.GzipFile(fileobj=file_obj, mode=mode)
elif ext == '.xz':
return lzma.LZMAFile(filename=file_obj, mode=mode, format=lzma.FORMAT_XZ)
else:
return file_obj

Expand Down
Binary file not shown.
64 changes: 53 additions & 11 deletions smart_open/tests/test_smart_open.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,11 +255,36 @@ def test_http_bz2(self):
if os.path.isfile(test_file):
os.unlink(test_file)

responses.add(responses.GET, "http://127.0.0.1/data.bz2",
body=compressed_data, stream=True)
responses.add(responses.GET, "http://127.0.0.1/data.bz2", body=compressed_data, stream=True)
smart_open_object = smart_open.smart_open("http://127.0.0.1/data.bz2")

# decompress the gzip and get the same md5 hash
# decompress the bzip2 and get the same md5 hash
self.assertEqual(smart_open_object.read(), test_string)

@responses.activate
def test_http_xz(self):
"""Can open xz via http?"""
test_string = b'Hello World Compressed.'
#
# TODO: why are these tests writing to temporary files? We can do the
# lzma compression in memory.
#
with tempfile.NamedTemporaryFile('wb', suffix='.xz', delete=False) as infile:
test_file = infile.name

with smart_open.smart_open(test_file, 'wb') as outfile:
outfile.write(test_string)

with open(test_file, 'rb') as infile:
compressed_data = infile.read()

if os.path.isfile(test_file):
os.unlink(test_file)

responses.add(responses.GET, "http://127.0.0.1/data.xz", body=compressed_data, stream=True)
smart_open_object = smart_open.smart_open("http://127.0.0.1/data.xz")

# decompress the xz stream and check that we get the original data back
self.assertEqual(smart_open_object.read(), test_string)


Expand Down Expand Up @@ -863,9 +888,16 @@ class CompressionFormatTest(unittest.TestCase):

TEXT = 'Hello'

def write_read_assertion(self, test_file):
def write_read_assertion(self, suffix):
with tempfile.NamedTemporaryFile('wb', suffix=suffix, delete=False) as infile:
test_file = infile.name

text = self.TEXT.encode('utf8')
with smart_open.smart_open(test_file, 'wb') as fout: # 'b' for binary, needed on Windows
fout.write(self.TEXT.encode('utf8'))
fout.write(text)

with open(test_file, 'rb') as fin:
self.assertNotEqual(text, fin.read())

with smart_open.smart_open(test_file, 'rb') as fin:
self.assertEqual(fin.read().decode('utf8'), self.TEXT)
Expand All @@ -884,15 +916,25 @@ def test_open_gz(self):

def test_write_read_gz(self):
"""Can write and read gzip?"""
with tempfile.NamedTemporaryFile('wb', suffix='.gz', delete=False) as infile:
test_file_name = infile.name
self.write_read_assertion(test_file_name)
self.write_read_assertion('.gz')

def test_write_read_bz2(self):
"""Can write and read bz2?"""
with tempfile.NamedTemporaryFile('wb', suffix='.bz2', delete=False) as infile:
test_file_name = infile.name
self.write_read_assertion(test_file_name)
self.write_read_assertion('.bz2')

def test_write_read_xz(self):
"""Can write and read xz?"""
self.write_read_assertion('.xz')

def test_read_real_xz(self):
"""Can read a real xz file."""
base_path = os.path.join(CURR_DIR, 'test_data/crime-and-punishment.txt')
head_path = os.path.join(CURR_DIR, 'test_data/crime-and-punishment.txt.xz')
with smart_open.smart_open(head_path) as f:
smart_data = f.read()
with open(base_path, 'rb') as f:
orig_data = f.read()
self.assertEqual(smart_data, orig_data)


class MultistreamsBZ2Test(unittest.TestCase):
Expand Down