From b46dc97f5da4089651dea898d4837166b1ce71c9 Mon Sep 17 00:00:00 2001 From: Vadim Markovtsev Date: Wed, 20 Feb 2019 15:14:56 +0100 Subject: [PATCH 1/4] Add support for .xz / lzma Signed-off-by: Vadim Markovtsev --- smart_open/smart_open_lib.py | 21 ++++++++--- smart_open/tests/test_smart_open.py | 54 ++++++++++++++++++++++++----- 2 files changed, 62 insertions(+), 13 deletions(-) diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index 80a3c1cc..cea55b26 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -59,6 +59,12 @@ import gzip +COMPRESSED_EXT = ('.gz', '.bz2') # supported compressed file extensions + +if not IS_PY2: + import lzma + COMPRESSED_EXT += '.xz', + # # This module defines a function called smart_open so we cannot use # smart_open.submodule to reference to the submodules. @@ -121,8 +127,8 @@ def smart_open(uri, mode="rb", **kw): The `uri` can be either: - 1. a URI for the local filesystem (compressed ``.gz`` or ``.bz2`` files handled automatically): - `./lines.txt`, `/home/joe/lines.txt.gz`, `file:///home/joe/lines.txt.bz2` + 1. a URI for the local filesystem (compressed ``.gz``, ``.bz2`` or ``.xz`` files handled + automatically): `./lines.txt`, `/home/joe/lines.txt.gz`, `file:///home/joe/lines.txt.bz2` 2. a URI for HDFS: `hdfs:///some/path/lines.txt` 3. a URI for Amazon's S3 (can also supply credentials inside the URI): `s3://my_bucket/lines.txt`, `s3://my_aws_key_id:key_secret@my_bucket/lines.txt` @@ -165,6 +171,8 @@ def smart_open(uri, mode="rb", **kw): ... fout.write("hello world!\n") >>> with smart_open.smart_open('/home/radim/another.txt.bz2', 'wb') as fout: ... fout.write("good bye!\n") + >>> with smart_open.smart_open('/home/radim/another.txt.xz', 'wb') as fout: + ... fout.write("never say never!\n") >>> # stream from/to (compressed) local files with Expand ~ and ~user constructions: >>> for line in smart_open.smart_open('~/my_file.txt'): ... print line @@ -270,7 +278,7 @@ def _shortcut_open(uri, mode, **kw): _, extension = P.splitext(parsed_uri.uri_path) ignore_extension = kw.get('ignore_extension', False) - if extension in ('.gz', '.bz2') and not ignore_extension: + if extension in COMPRESSED_EXT and not ignore_extension: return None # @@ -330,7 +338,7 @@ def _open_binary_stream(uri, mode, **kw): if parsed_uri.scheme in ("file", ): # local files -- both read & write supported - # compression, if any, is determined by the filename extension (.gz, .bz2) + # compression, if any, is determined by the filename extension (.gz, .bz2, .xz) fobj = io.open(parsed_uri.uri_path, mode) return fobj, filename elif parsed_uri.scheme in smart_open_s3.SUPPORTED_SCHEMES: @@ -431,6 +439,7 @@ def _parse_uri(uri_as_string): * ./local/path/file.gz * file:///home/user/file * file:///home/user/file.bz2 + * file:///home/user/file.xz """ if os.name == 'nt': # urlsplit doesn't work on Windows -- it parses the drive as the scheme... @@ -552,7 +561,7 @@ def _need_to_buffer(file_obj, mode, ext): # .seekable, but have a .seek method instead. # is_seekable = hasattr(file_obj, 'seek') - return six.PY2 and mode.startswith('r') and ext in ('.gz', '.bz2') and not is_seekable + return six.PY2 and mode.startswith('r') and ext in COMPRESSED_EXT and not is_seekable def _compression_wrapper(file_obj, filename, mode): @@ -576,6 +585,8 @@ def _compression_wrapper(file_obj, filename, mode): return BZ2File(file_obj, mode) elif ext == '.gz': return gzip.GzipFile(fileobj=file_obj, mode=mode) + elif ext == '.xz' and not IS_PY2: + return lzma.LZMAFile(filename=file_obj, mode=mode, format=lzma.FORMAT_XZ) else: return file_obj diff --git a/smart_open/tests/test_smart_open.py b/smart_open/tests/test_smart_open.py index cc567d2e..3fe66f4e 100644 --- a/smart_open/tests/test_smart_open.py +++ b/smart_open/tests/test_smart_open.py @@ -259,7 +259,35 @@ def test_http_bz2(self): body=compressed_data, stream=True) smart_open_object = smart_open.smart_open("http://127.0.0.1/data.bz2") - # decompress the gzip and get the same md5 hash + # decompress the bzip2 and get the same md5 hash + self.assertEqual(smart_open_object.read(), test_string) + + @responses.activate + @unittest.skipIf(six.PY2, 'Py2 does not have a built-in lzma codec') + def test_http_xz(self): + """Can open xz via http?""" + test_string = b'Hello World Compressed.' + # + # TODO: why are these tests writing to temporary files? We can do the + # lzma compression in memory. + # + with tempfile.NamedTemporaryFile('wb', suffix='.xz', delete=False) as infile: + test_file = infile.name + + with smart_open.smart_open(test_file, 'wb') as outfile: + outfile.write(test_string) + + with open(test_file, 'rb') as infile: + compressed_data = infile.read() + + if os.path.isfile(test_file): + os.unlink(test_file) + + responses.add(responses.GET, "http://127.0.0.1/data.xz", + body=compressed_data, stream=True) + smart_open_object = smart_open.smart_open("http://127.0.0.1/data.xz") + + # decompress the xz and get the same md5 hash self.assertEqual(smart_open_object.read(), test_string) @@ -864,8 +892,12 @@ class CompressionFormatTest(unittest.TestCase): TEXT = 'Hello' def write_read_assertion(self, test_file): + text = self.TEXT.encode('utf8') with smart_open.smart_open(test_file, 'wb') as fout: # 'b' for binary, needed on Windows - fout.write(self.TEXT.encode('utf8')) + fout.write(text) + + with open(test_file, 'rb') as fin: + self.assertNotEqual(text, fin.read()) with smart_open.smart_open(test_file, 'rb') as fin: self.assertEqual(fin.read().decode('utf8'), self.TEXT) @@ -882,17 +914,23 @@ def test_open_gz(self): assert m.hexdigest() == '18473e60f8c7c98d29d65bf805736a0d', \ 'Failed to read gzip' - def test_write_read_gz(self): - """Can write and read gzip?""" - with tempfile.NamedTemporaryFile('wb', suffix='.gz', delete=False) as infile: + def _test_write_read(self, suffix): + with tempfile.NamedTemporaryFile('wb', suffix=suffix, delete=False) as infile: test_file_name = infile.name self.write_read_assertion(test_file_name) + def test_write_read_gz(self): + """Can write and read gzip?""" + self._test_write_read('.gz') + def test_write_read_bz2(self): """Can write and read bz2?""" - with tempfile.NamedTemporaryFile('wb', suffix='.bz2', delete=False) as infile: - test_file_name = infile.name - self.write_read_assertion(test_file_name) + self._test_write_read('.bz2') + + @unittest.skipIf(six.PY2, 'Py2 does not have a built-in lzma codec') + def test_write_read_xz(self): + """Can write and read xz2?""" + self._test_write_read('.xz') class MultistreamsBZ2Test(unittest.TestCase): From ae4f0198112a50a4a135b32fa9d88bf3d612809f Mon Sep 17 00:00:00 2001 From: Vadim Markovtsev Date: Wed, 20 Feb 2019 16:01:32 +0100 Subject: [PATCH 2/4] Support xz on py2.7 Signed-off-by: Vadim Markovtsev --- setup.py | 3 ++- smart_open/smart_open_lib.py | 12 +++++++----- smart_open/tests/test_smart_open.py | 2 -- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index d3ee1a33..f6fac6a1 100644 --- a/setup.py +++ b/setup.py @@ -56,7 +56,8 @@ def read(fname): 'boto >= 2.32', 'bz2file', 'requests', - 'boto3' + 'boto3', + 'backports.lzma;python_version<"3.3"', ], tests_require=tests_require, extras_require={ diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index cea55b26..240ad2e3 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -58,12 +58,14 @@ from bz2 import BZ2File import gzip +try: + import lzma +except ImportError: + # py<3.3 + from backports import lzma -COMPRESSED_EXT = ('.gz', '.bz2') # supported compressed file extensions -if not IS_PY2: - import lzma - COMPRESSED_EXT += '.xz', +COMPRESSED_EXT = ('.gz', '.bz2', '.xz') # supported compressed file extensions # # This module defines a function called smart_open so we cannot use @@ -585,7 +587,7 @@ def _compression_wrapper(file_obj, filename, mode): return BZ2File(file_obj, mode) elif ext == '.gz': return gzip.GzipFile(fileobj=file_obj, mode=mode) - elif ext == '.xz' and not IS_PY2: + elif ext == '.xz': return lzma.LZMAFile(filename=file_obj, mode=mode, format=lzma.FORMAT_XZ) else: return file_obj diff --git a/smart_open/tests/test_smart_open.py b/smart_open/tests/test_smart_open.py index 3fe66f4e..b2628cb4 100644 --- a/smart_open/tests/test_smart_open.py +++ b/smart_open/tests/test_smart_open.py @@ -263,7 +263,6 @@ def test_http_bz2(self): self.assertEqual(smart_open_object.read(), test_string) @responses.activate - @unittest.skipIf(six.PY2, 'Py2 does not have a built-in lzma codec') def test_http_xz(self): """Can open xz via http?""" test_string = b'Hello World Compressed.' @@ -927,7 +926,6 @@ def test_write_read_bz2(self): """Can write and read bz2?""" self._test_write_read('.bz2') - @unittest.skipIf(six.PY2, 'Py2 does not have a built-in lzma codec') def test_write_read_xz(self): """Can write and read xz2?""" self._test_write_read('.xz') From ddf60c77cde5084aa5d5f7f838631fa6e67af492 Mon Sep 17 00:00:00 2001 From: Vadim Markovtsev Date: Wed, 20 Feb 2019 16:13:33 +0100 Subject: [PATCH 3/4] Specify the supported archive formats in the README Signed-off-by: Vadim Markovtsev --- README.rst | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index b0fa89d2..a6b987cd 100644 --- a/README.rst +++ b/README.rst @@ -14,7 +14,7 @@ What? ``smart_open`` is a Python 2 & Python 3 library for **efficient streaming of very large files** from/to S3, HDFS, WebHDFS, HTTP, or local (compressed) files. It's a drop-in replacement for Python's built-in ``open()``: it can do anything ``open`` can (100% compatible, falls back to native ``open`` wherever possible), plus lots of nifty extra stuff on top. -``smart_open`` is well-tested, well-documented and sports a simple, Pythonic API: +``smart_open`` is well-tested, well-documented, and has a simple, Pythonic API: .. code-block:: python @@ -100,6 +100,11 @@ Or, if you prefer to install from the `source tar.gz `_ , `moto `_ and `responses `_ (``pip install mock moto responses``). The tests are also run automatically with `Travis CI `_ on every commit push & pull request. +Supported archive types +----------------------- +``smart_open`` allows reading and writing gzip, bzip2 and xz files. They are transparently handled +over HTTP, too. + S3-Specific Options ------------------- From 8363eb27a0540676461599e4374fa7cd84559e2c Mon Sep 17 00:00:00 2001 From: Vadim Markovtsev Date: Thu, 21 Feb 2019 07:53:41 +0100 Subject: [PATCH 4/4] Apply review suggestions Signed-off-by: Vadim Markovtsev --- README.rst | 2 +- .../test_data/crime-and-punishment.txt.xz | Bin 0 -> 1200 bytes smart_open/tests/test_smart_open.py | 32 +++++++++++------- 3 files changed, 20 insertions(+), 14 deletions(-) create mode 100644 smart_open/tests/test_data/crime-and-punishment.txt.xz diff --git a/README.rst b/README.rst index a6b987cd..818ef232 100644 --- a/README.rst +++ b/README.rst @@ -103,7 +103,7 @@ To run the unit tests (optional), you'll also need to install `mock xk#fDHy3l+|x5Q=yr9 z+JJGXF2|EL;#c3KPg8a2S{IQDbXyzyF{&e~ZV)9jd~q4<^+U~PsmTZ*iRl0=I+Xi7Dn$)Gp3j2$$MUQZ0d!sg8YrDaTxYvZL@EO?z2 zrP-HxYeO~>JN9Y62o)mBy!V`C!rRq`ZRoFTk#s*ogbK8knkmzmG>jkK=Wsw(A#Fo# zvG68!5Fm!J%DBEs+#+rxeV*EE%75Ht@GPinTDMI+(5F;KzPaAh9~kl=G-^s|^%=05ohi9gOV@cGTkUOz4)d(d8Xi4*EcR>MZbRE+T}TyYDYus;wJ$ zqj~eABMH%4*k&dx^XKOXCNm@jxRk}<*0 zZZE}*rjn_o_-+29Nm}d;ZKw;meT%N81zzD$(qPnKZZ4nJb81XB3+9XHemA)4d`5G*i3Fw@VhENeEfoP z=8C(oxp!5qbe=+!rb$ruy5r1+T`p?@nA%ZlXod?0{Tj{&C!d}TOg(V`Pne}L08%)x z6LsQ!Nw-sDZCgeG6i0b^H;dC-TI!PN?h)W~;o4iJP?~L-h;J(4<5p+05#bSpQY zJOVl7&o~7(*Pq1D1`^B`Qg^U9^@uoJE zGMAc9SngIFX&c>GZ@MM(ZwOY=sGb-aggC+BX0@|wnf2_m4#kFH4TY9+^pj!dK!6st zU*IIz`uT43)WXcu<<7V$lF*n$%uK222j7pbg=M25x@A138NZ5tMl|H7sM5X zBv)#{6ObTuttmw40oKU&OzznzNECQ%O&UI;LfAT~LmqWBeEKh`SJOoR(ebzOp1tE! zgrwVMpHz$LrWE?>Y=zUdn1FY8$9;H8R|gT+yPyE}T}*>bd&nvP0gMUt7XScnG3F1k O#Ao{g000001X)^;&P8_s literal 0 HcmV?d00001 diff --git a/smart_open/tests/test_smart_open.py b/smart_open/tests/test_smart_open.py index b2628cb4..c29da7cc 100644 --- a/smart_open/tests/test_smart_open.py +++ b/smart_open/tests/test_smart_open.py @@ -255,8 +255,7 @@ def test_http_bz2(self): if os.path.isfile(test_file): os.unlink(test_file) - responses.add(responses.GET, "http://127.0.0.1/data.bz2", - body=compressed_data, stream=True) + responses.add(responses.GET, "http://127.0.0.1/data.bz2", body=compressed_data, stream=True) smart_open_object = smart_open.smart_open("http://127.0.0.1/data.bz2") # decompress the bzip2 and get the same md5 hash @@ -282,8 +281,7 @@ def test_http_xz(self): if os.path.isfile(test_file): os.unlink(test_file) - responses.add(responses.GET, "http://127.0.0.1/data.xz", - body=compressed_data, stream=True) + responses.add(responses.GET, "http://127.0.0.1/data.xz", body=compressed_data, stream=True) smart_open_object = smart_open.smart_open("http://127.0.0.1/data.xz") # decompress the xz and get the same md5 hash @@ -890,7 +888,10 @@ class CompressionFormatTest(unittest.TestCase): TEXT = 'Hello' - def write_read_assertion(self, test_file): + def write_read_assertion(self, suffix): + with tempfile.NamedTemporaryFile('wb', suffix=suffix, delete=False) as infile: + test_file = infile.name + text = self.TEXT.encode('utf8') with smart_open.smart_open(test_file, 'wb') as fout: # 'b' for binary, needed on Windows fout.write(text) @@ -913,22 +914,27 @@ def test_open_gz(self): assert m.hexdigest() == '18473e60f8c7c98d29d65bf805736a0d', \ 'Failed to read gzip' - def _test_write_read(self, suffix): - with tempfile.NamedTemporaryFile('wb', suffix=suffix, delete=False) as infile: - test_file_name = infile.name - self.write_read_assertion(test_file_name) - def test_write_read_gz(self): """Can write and read gzip?""" - self._test_write_read('.gz') + self.write_read_assertion('.gz') def test_write_read_bz2(self): """Can write and read bz2?""" - self._test_write_read('.bz2') + self.write_read_assertion('.bz2') def test_write_read_xz(self): """Can write and read xz2?""" - self._test_write_read('.xz') + self.write_read_assertion('.xz') + + def test_read_real_xz(self): + """Can read a real xz file.""" + base_path = os.path.join(CURR_DIR, 'test_data/crime-and-punishment.txt') + head_path = os.path.join(CURR_DIR, 'test_data/crime-and-punishment.txt.xz') + with smart_open.smart_open(head_path) as f: + smart_data = f.read() + with open(base_path, 'rb') as f: + orig_data = f.read() + self.assertEqual(smart_data, orig_data) class MultistreamsBZ2Test(unittest.TestCase):