Skip to content

BUG: GH10581 where read_msgpack does not respect encoding #10686

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 18, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v0.17.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,8 @@ Other enhancements

- ``drop_duplicates`` and ``duplicated`` now accept ``keep`` keyword to target first, last, and all duplicates. ``take_last`` keyword is deprecated, see :ref:`deprecations <whatsnew_0170.deprecations>` (:issue:`6511`, :issue:`8505`)

- ``msgpack`` submodule has been updated to 0.4.6 with backward compatibility (:issue:`10581`)

.. ipython:: python

s = pd.Series(['A', 'B', 'C', 'A', 'B', 'D'])
Expand Down Expand Up @@ -669,4 +671,5 @@ Bug Fixes
- Bug in ``Series([np.nan]).astype('M8[ms]')``, which now returns ``Series([pd.NaT])`` (:issue:`10747`)
- Bug in ``PeriodIndex.order`` reset freq (:issue:`10295`)
- Bug in ``iloc`` allowing memory outside bounds of a Series to be accessed with negative integers (:issue:`10779`)
- Bug in ``read_msgpack`` where encoding is not respected (:issue:`10581`)
- Bug preventing access to the first index when using ``iloc`` with a list containing the appropriate negative integer (:issue:`10547`, :issue:`10779`)
45 changes: 31 additions & 14 deletions pandas/io/packers.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@
from pandas.core.internals import BlockManager, make_block
import pandas.core.internals as internals

from pandas.msgpack import Unpacker as _Unpacker, Packer as _Packer
from pandas.msgpack import Unpacker as _Unpacker, Packer as _Packer, ExtType

# until we can pass this into our conversion functions,
# this is pretty hacky
Expand Down Expand Up @@ -131,7 +131,7 @@ def read_msgpack(path_or_buf, iterator=False, **kwargs):
return Iterator(path_or_buf)

def read(fh):
    """Unpack everything in *fh*, unwrapping a singleton list to its element.

    ``kwargs`` is taken from the enclosing ``read_msgpack`` call so that
    options such as ``encoding`` reach the unpacker.
    """
    unpacked = list(unpack(fh, **kwargs))
    return unpacked[0] if len(unpacked) == 1 else unpacked
Expand Down Expand Up @@ -222,7 +222,7 @@ def convert(values):
# convert to a bytes array
v = v.tostring()
import zlib
return zlib.compress(v)
return ExtType(0, zlib.compress(v))

elif compressor == 'blosc':

Expand All @@ -233,18 +233,24 @@ def convert(values):
# convert to a bytes array
v = v.tostring()
import blosc
return blosc.compress(v, typesize=dtype.itemsize)
return ExtType(0, blosc.compress(v, typesize=dtype.itemsize))

# ndarray (on original dtype)
return v.tostring()
return ExtType(0, v.tostring())


def unconvert(values, dtype, compress=None):

as_is_ext = isinstance(values, ExtType) and values.code == 0

if as_is_ext:
values = values.data

if dtype == np.object_:
return np.array(values, dtype=object)

values = values.encode('latin1')
if not as_is_ext:
values = values.encode('latin1')

if compress == 'zlib':
import zlib
Expand Down Expand Up @@ -558,19 +564,23 @@ def create_block(b):


def pack(o, default=encode,
encoding='latin1', unicode_errors='strict', use_single_float=False):
encoding='latin1', unicode_errors='strict', use_single_float=False,
autoreset=1, use_bin_type=1):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is use_bin_type=1 here but 0 below?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch. I will change the other one to 1.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what do these new args mean anyhow?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use_bin_type is for a new binary format; str 8 was introduced at the same time, so that arg enables both, though they needn't be linked. I think it's good to differentiate binary and strings, since in python there is a distinction. In PY2 you wouldn't be able to round-trip str--which is really bytes--using utf 16 or utf 32 encoding without that enabled. autoreset is not new. I don't know if the omission from packers was intentional or not.

"""
Pack an object and return the packed bytes.
"""

return Packer(default=default, encoding=encoding,
unicode_errors=unicode_errors,
use_single_float=use_single_float).pack(o)
use_single_float=use_single_float,
autoreset=autoreset,
use_bin_type=use_bin_type).pack(o)


def unpack(packed, object_hook=decode,
list_hook=None, use_list=False, encoding='latin1',
unicode_errors='strict', object_pairs_hook=None):
unicode_errors='strict', object_pairs_hook=None,
max_buffer_size=0, ext_hook=ExtType):
"""
Unpack a packed object, return an iterator
Note: packed lists will be returned as tuples
Expand All @@ -580,27 +590,33 @@ def unpack(packed, object_hook=decode,
list_hook=list_hook,
use_list=use_list, encoding=encoding,
unicode_errors=unicode_errors,
object_pairs_hook=object_pairs_hook)
object_pairs_hook=object_pairs_hook,
max_buffer_size=max_buffer_size,
ext_hook=ext_hook)


class Packer(_Packer):
    """pandas-facing wrapper over the msgpack extension Packer.

    Defaults mirror ``pack``: latin1 encoding, strict unicode errors,
    double-precision floats, ``autoreset`` on, and ``use_bin_type``
    enabled so binary data and strings are distinguishable on the wire.
    """

    def __init__(self, default=encode, encoding='latin1',
                 unicode_errors='strict', use_single_float=False,
                 autoreset=1, use_bin_type=1):
        # Every option is forwarded verbatim to the extension Packer.
        super(Packer, self).__init__(
            default=default,
            encoding=encoding,
            unicode_errors=unicode_errors,
            use_single_float=use_single_float,
            autoreset=autoreset,
            use_bin_type=use_bin_type)


class Unpacker(_Unpacker):

def __init__(self, file_like=None, read_size=0, use_list=False,
object_hook=decode,
object_pairs_hook=None, list_hook=None, encoding='latin1',
unicode_errors='strict', max_buffer_size=0):
unicode_errors='strict', max_buffer_size=0, ext_hook=ExtType):
super(Unpacker, self).__init__(file_like=file_like,
read_size=read_size,
use_list=use_list,
Expand All @@ -609,7 +625,8 @@ def __init__(self, file_like=None, read_size=0, use_list=False,
list_hook=list_hook,
encoding=encoding,
unicode_errors=unicode_errors,
max_buffer_size=max_buffer_size)
max_buffer_size=max_buffer_size,
ext_hook=ext_hook)


class Iterator(object):
Expand Down
32 changes: 29 additions & 3 deletions pandas/io/tests/test_packers.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,9 @@ def setUp(self):
def tearDown(self):
    """No per-test teardown is required for these packer tests."""
    pass

def encode_decode(self, x, **kwargs):
def encode_decode(self, x, compress=None, **kwargs):
with ensure_clean(self.path) as p:
to_msgpack(p, x, **kwargs)
to_msgpack(p, x, compress=compress, **kwargs)
return read_msgpack(p, **kwargs)

class TestAPI(TestPackers):
Expand Down Expand Up @@ -517,12 +517,38 @@ def test_compression_blosc(self):
assert_frame_equal(self.frame[k], i_rec[k])


class TestEncoding(TestPackers):
    """Round-trip frames through msgpack under explicit UTF encodings.

    Regression coverage for GH10581, where ``read_msgpack`` did not
    respect the requested ``encoding``.
    """

    def setUp(self):
        super(TestEncoding, self).setUp()
        columns = {
            'A': [compat.u('\u2019')] * 1000,
            'B': np.arange(1000, dtype=np.int32),
            'C': list(100 * 'abcdefghij'),
            'D': date_range(datetime.datetime(2015, 4, 1), periods=1000),
            'E': [datetime.timedelta(days=n) for n in range(1000)],
            'G': [400] * 1000
        }
        # NOTE(review): the repeated keys (['A', 'A'], ['B', 'B']) collapse
        # to a single column in the dict — presumably intentional shorthand
        # for a one-column frame; worth confirming.
        self.frame = {
            'float': DataFrame(dict((k, columns[k]) for k in ['A', 'A'])),
            'int': DataFrame(dict((k, columns[k]) for k in ['B', 'B'])),
            'mixed': DataFrame(columns),
        }
        self.utf_encodings = ['utf8', 'utf16', 'utf32']

    def test_utf(self):
        # GH10581: every frame must survive a round trip in every encoding.
        for enc in self.utf_encodings:
            for original in compat.itervalues(self.frame):
                roundtripped = self.encode_decode(original, encoding=enc)
                assert_frame_equal(roundtripped, original)


class TestMsgpack():
"""
How to add msgpack tests:

1. Install pandas version intended to output the msgpack.

TestPackers
2. Execute "generate_legacy_storage_files.py" to create the msgpack.
$ python generate_legacy_storage_files.py <output_dir> msgpack

Expand Down
Loading