Skip to content

BUG: GH10581 where read_msgpack does not respect encoding #10686

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 18, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v0.17.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,8 @@ Other enhancements

- ``drop_duplicates`` and ``duplicated`` now accept ``keep`` keyword to target first, last, and all duplicates. ``take_last`` keyword is deprecated, see :ref:`deprecations <whatsnew_0170.deprecations>` (:issue:`6511`, :issue:`8505`)

- ``msgpack`` submodule has been updated to 0.4.6 with backward compatibility (:issue:`10581`)

.. ipython:: python

s = pd.Series(['A', 'B', 'C', 'A', 'B', 'D'])
Expand Down Expand Up @@ -669,4 +671,5 @@ Bug Fixes
- Bug in ``Series([np.nan]).astype('M8[ms]')``, which now returns ``Series([pd.NaT])`` (:issue:`10747`)
- Bug in ``PeriodIndex.order`` reset freq (:issue:`10295`)
- Bug in ``iloc`` allowing memory outside bounds of a Series to be accessed with negative integers (:issue:`10779`)
- Bug in ``read_msgpack`` where encoding is not respected (:issue:`10581`)
- Bug preventing access to the first index when using ``iloc`` with a list containing the appropriate negative integer (:issue:`10547`, :issue:`10779`)
45 changes: 31 additions & 14 deletions pandas/io/packers.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@
from pandas.core.internals import BlockManager, make_block
import pandas.core.internals as internals

from pandas.msgpack import Unpacker as _Unpacker, Packer as _Packer
from pandas.msgpack import Unpacker as _Unpacker, Packer as _Packer, ExtType

# until we can pass this into our conversion functions,
# this is pretty hacky
Expand Down Expand Up @@ -131,7 +131,7 @@ def read_msgpack(path_or_buf, iterator=False, **kwargs):
return Iterator(path_or_buf)

def read(fh):
    """Unpack everything in *fh*, unwrapping a singleton list to its element.

    ``kwargs`` is taken from the enclosing ``read_msgpack`` call so that
    options such as ``encoding`` reach the unpacker.
    """
    unpacked = list(unpack(fh, **kwargs))
    return unpacked[0] if len(unpacked) == 1 else unpacked
Expand Down Expand Up @@ -222,7 +222,7 @@ def convert(values):
# convert to a bytes array
v = v.tostring()
import zlib
return zlib.compress(v)
return ExtType(0, zlib.compress(v))

elif compressor == 'blosc':

Expand All @@ -233,18 +233,24 @@ def convert(values):
# convert to a bytes array
v = v.tostring()
import blosc
return blosc.compress(v, typesize=dtype.itemsize)
return ExtType(0, blosc.compress(v, typesize=dtype.itemsize))

# ndarray (on original dtype)
return v.tostring()
return ExtType(0, v.tostring())


def unconvert(values, dtype, compress=None):

as_is_ext = isinstance(values, ExtType) and values.code == 0

if as_is_ext:
values = values.data

if dtype == np.object_:
return np.array(values, dtype=object)

values = values.encode('latin1')
if not as_is_ext:
values = values.encode('latin1')

if compress == 'zlib':
import zlib
Expand Down Expand Up @@ -558,19 +564,23 @@ def create_block(b):


def pack(o, default=encode,
encoding='latin1', unicode_errors='strict', use_single_float=False):
encoding='latin1', unicode_errors='strict', use_single_float=False,
autoreset=1, use_bin_type=1):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is use_bin_type=1 here but 0 below?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch. I will change the other one to 1.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what do these new args mean anyhow?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use_bin_type is for a new binary format; str 8 was introduced at the same time, so that arg enables both, though they needn't be linked. I think it's good to differentiate binary and strings, since in python there is a distinction. In PY2 you wouldn't be able to round-trip str--which is really bytes--using utf 16 or utf 32 encoding without that enabled. autoreset is not new. I don't know if the omission from packers was intentional or not.

"""
Pack an object and return the packed bytes.
"""

return Packer(default=default, encoding=encoding,
unicode_errors=unicode_errors,
use_single_float=use_single_float).pack(o)
use_single_float=use_single_float,
autoreset=autoreset,
use_bin_type=use_bin_type).pack(o)


def unpack(packed, object_hook=decode,
list_hook=None, use_list=False, encoding='latin1',
unicode_errors='strict', object_pairs_hook=None):
unicode_errors='strict', object_pairs_hook=None,
max_buffer_size=0, ext_hook=ExtType):
"""
Unpack a packed object, return an iterator
Note: packed lists will be returned as tuples
Expand All @@ -580,27 +590,33 @@ def unpack(packed, object_hook=decode,
list_hook=list_hook,
use_list=use_list, encoding=encoding,
unicode_errors=unicode_errors,
object_pairs_hook=object_pairs_hook)
object_pairs_hook=object_pairs_hook,
max_buffer_size=max_buffer_size,
ext_hook=ext_hook)


class Packer(_Packer):
    """pandas-facing wrapper over the msgpack extension Packer.

    Defaults mirror ``pack``: latin1 encoding, strict unicode errors,
    double-precision floats, ``autoreset`` on, and ``use_bin_type``
    enabled so binary data and strings are distinguishable on the wire.
    """

    def __init__(self, default=encode, encoding='latin1',
                 unicode_errors='strict', use_single_float=False,
                 autoreset=1, use_bin_type=1):
        # Every option is forwarded verbatim to the extension Packer.
        super(Packer, self).__init__(
            default=default,
            encoding=encoding,
            unicode_errors=unicode_errors,
            use_single_float=use_single_float,
            autoreset=autoreset,
            use_bin_type=use_bin_type)


class Unpacker(_Unpacker):

def __init__(self, file_like=None, read_size=0, use_list=False,
object_hook=decode,
object_pairs_hook=None, list_hook=None, encoding='latin1',
unicode_errors='strict', max_buffer_size=0):
unicode_errors='strict', max_buffer_size=0, ext_hook=ExtType):
super(Unpacker, self).__init__(file_like=file_like,
read_size=read_size,
use_list=use_list,
Expand All @@ -609,7 +625,8 @@ def __init__(self, file_like=None, read_size=0, use_list=False,
list_hook=list_hook,
encoding=encoding,
unicode_errors=unicode_errors,
max_buffer_size=max_buffer_size)
max_buffer_size=max_buffer_size,
ext_hook=ext_hook)


class Iterator(object):
Expand Down
32 changes: 29 additions & 3 deletions pandas/io/tests/test_packers.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,9 @@ def setUp(self):
def tearDown(self):
    """No per-test teardown is required for these packer tests."""
    pass

def encode_decode(self, x, **kwargs):
def encode_decode(self, x, compress=None, **kwargs):
with ensure_clean(self.path) as p:
to_msgpack(p, x, **kwargs)
to_msgpack(p, x, compress=compress, **kwargs)
return read_msgpack(p, **kwargs)

class TestAPI(TestPackers):
Expand Down Expand Up @@ -517,12 +517,38 @@ def test_compression_blosc(self):
assert_frame_equal(self.frame[k], i_rec[k])


class TestEncoding(TestPackers):
    """Round-trip frames through msgpack under explicit UTF encodings.

    Regression coverage for GH10581, where ``read_msgpack`` did not
    respect the requested ``encoding``.
    """

    def setUp(self):
        super(TestEncoding, self).setUp()
        columns = {
            'A': [compat.u('\u2019')] * 1000,
            'B': np.arange(1000, dtype=np.int32),
            'C': list(100 * 'abcdefghij'),
            'D': date_range(datetime.datetime(2015, 4, 1), periods=1000),
            'E': [datetime.timedelta(days=n) for n in range(1000)],
            'G': [400] * 1000
        }
        # NOTE(review): the repeated keys (['A', 'A'], ['B', 'B']) collapse
        # to a single column in the dict — presumably intentional shorthand
        # for a one-column frame; worth confirming.
        self.frame = {
            'float': DataFrame(dict((k, columns[k]) for k in ['A', 'A'])),
            'int': DataFrame(dict((k, columns[k]) for k in ['B', 'B'])),
            'mixed': DataFrame(columns),
        }
        self.utf_encodings = ['utf8', 'utf16', 'utf32']

    def test_utf(self):
        # GH10581: every frame must survive a round trip in every encoding.
        for enc in self.utf_encodings:
            for original in compat.itervalues(self.frame):
                roundtripped = self.encode_decode(original, encoding=enc)
                assert_frame_equal(roundtripped, original)


class TestMsgpack():
"""
How to add msgpack tests:

1. Install pandas version intended to output the msgpack.

TestPackers
2. Execute "generate_legacy_storage_files.py" to create the msgpack.
$ python generate_legacy_storage_files.py <output_dir> msgpack

Expand Down
Loading