Skip to content

BUG: avoid "b" prefix for bytes in to_csv() on Python 3 (#9712) #13890

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.19.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -968,3 +968,5 @@ Bug Fixes
- Bug in ``pd.read_csv`` in Python 2.x with non-UTF8 encoded, multi-character separated data (:issue:`3404`)

- Bug in ``Index`` raises ``KeyError`` displaying incorrect column when column is not in the df and columns contains duplicate values (:issue:`13822`)

- Bug in ``to_csv()`` in Python 3 which wrote bytes values with a ``b''`` prefix instead of decoding them to text (:issue:`9712`)
8 changes: 8 additions & 0 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -2020,6 +2020,14 @@ def re_replacer(s):

return block

def to_native_types(self, slicer=None, na_rep='nan', quoting=None,
                    bytes_encoding=None, **kwargs):
    """Convert the block values to a native-types array for CSV output.

    Delegates to ``Block.to_native_types`` and then, when
    ``bytes_encoding`` is given, decodes any ``bytes`` entries of the
    resulting object arrays in place so that Python 3 output does not
    carry a ``b''`` prefix.

    Parameters
    ----------
    slicer : optional slice applied before formatting
    na_rep : string representation for missing values
    quoting : csv quoting level, forwarded to the base implementation
    bytes_encoding : codec name used to decode bytes, or None to skip
    """
    converted = Block.to_native_types(self, slicer, na_rep, quoting,
                                      **kwargs)
    if bytes_encoding is not None:
        # decode in place; helper is a no-op for non-bytes entries
        for column_values in converted:
            lib.object_array_decode_bytes(column_values, bytes_encoding)
    return converted


class CategoricalBlock(NonConsolidatableMixIn, ObjectBlock):
__slots__ = ()
Expand Down
20 changes: 20 additions & 0 deletions pandas/formats/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -1378,6 +1378,12 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
self.has_mi_columns = (isinstance(obj.columns, MultiIndex) and
not self.tupleize_cols)

# in Python 3, decode bytes to str so strings print without b''
if compat.PY3:
self.bytes_encoding = (encoding or get_option("display.encoding"))
Copy link
Contributor

@jreback jreback Aug 6, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why are we calling this bytes_encoding? we can simply use encoding

else:
self.bytes_encoding = None

# validate mi options
if self.has_mi_columns:
if cols is not None:
Expand All @@ -1387,6 +1393,7 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
if cols is not None:
if isinstance(cols, Index):
cols = cols.to_native_types(na_rep=na_rep,
bytes_encoding=self.bytes_encoding,
float_format=float_format,
date_format=date_format,
quoting=self.quoting)
Expand All @@ -1399,6 +1406,7 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
cols = self.obj.columns
if isinstance(cols, Index):
cols = cols.to_native_types(na_rep=na_rep,
bytes_encoding=self.bytes_encoding,
float_format=float_format,
date_format=date_format,
quoting=self.quoting)
Expand Down Expand Up @@ -1506,6 +1514,8 @@ def _save_header(self):
else:
encoded_labels = []

self._bytes_to_str(encoded_labels)

if not has_mi_columns:
encoded_labels += list(write_cols)

Expand Down Expand Up @@ -1565,6 +1575,7 @@ def _save_chunk(self, start_i, end_i):
for i in range(len(self.blocks)):
b = self.blocks[i]
d = b.to_native_types(slicer=slicer, na_rep=self.na_rep,
bytes_encoding=self.bytes_encoding,
float_format=self.float_format,
decimal=self.decimal,
date_format=self.date_format,
Expand All @@ -1575,13 +1586,22 @@ def _save_chunk(self, start_i, end_i):
self.data[col_loc] = col

ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep,
bytes_encoding=self.bytes_encoding,
float_format=self.float_format,
decimal=self.decimal,
date_format=self.date_format,
quoting=self.quoting)

lib.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer)

def _bytes_to_str(self, values):
"""Modify values list by decoding bytes to str."""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so I think if we have mixed bytes & strings then we should simply raise a ValueError.

you can use lib.is_bytes_array(arr), pass it an array (e.g. here you could do np.asarray(values) to ensure the arr)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried using lib.is_bytes_array(arr) now. The problem is, it returns False if there are NaNs. So an array with all bytes apart from some "missing" values is not converted. This doesn't work, because CSV output by default prints NaN elements in string columns as empty strings.

if self.bytes_encoding:
for ii, value in enumerate(values):
if isinstance(value, bytes):
values[ii] = value.decode(self.bytes_encoding)


# from collections import namedtuple
# ExcelCell = namedtuple("ExcelCell",
# 'row, col, val, style, mergestart, mergeend')
Expand Down
7 changes: 5 additions & 2 deletions pandas/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1580,12 +1580,15 @@ def _format_with_header(self, header, na_rep='NaN', **kwargs):
result = _trim_front(format_array(values, None, justify='left'))
return header + result

def to_native_types(self, slicer=None, bytes_encoding=None, **kwargs):
    """Slice the index if requested, then format to native types.

    Parameters
    ----------
    slicer : optional slice applied to the index before formatting
    bytes_encoding : codec used to decode bytes elements of an
        object-dtype result in place, or None to skip decoding
    """
    sliced = self if slicer is None else self[slicer]
    formatted = sliced._format_native_types(**kwargs)
    # only object arrays can hold bytes; other dtypes need no decoding
    if bytes_encoding is not None and formatted.dtype == object:
        lib.object_array_decode_bytes(formatted, bytes_encoding)
    return formatted

def _format_native_types(self, na_rep='', quoting=None, **kwargs):
""" actually format my specific types """
Expand Down
19 changes: 19 additions & 0 deletions pandas/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1053,6 +1053,25 @@ def string_array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_re

return arr

@cython.boundscheck(False)
@cython.wraparound(False)
def object_array_decode_bytes(ndarray[object, ndim=1] arr, object encoding):
    """Decode any instances of bytes to str in arr using the given encoding.

    Operates in place and also returns ``arr`` for convenience. Tuples
    containing bytes are rebuilt with their bytes elements decoded; all
    other elements (including NaN) are left untouched.
    """
    if bytes == str:
        # Python 2: bytes is str, so there is nothing to decode.
        # FIX: previously this path returned None while the normal path
        # returned arr; return arr consistently on both paths.
        return arr

    cdef int length = arr.shape[0], i = 0
    for i from 0 <= i < length:
        if isinstance(arr[i], bytes):
            arr[i] = arr[i].decode(encoding)
        elif isinstance(arr[i], tuple):
            # tuples are immutable: rebuild only when at least one
            # element actually needs decoding
            mask = [isinstance(it, bytes) for it in arr[i]]
            if any(mask):
                val = [it.decode(encoding) if mask[j] else it
                       for j, it in enumerate(arr[i])]
                arr[i] = tuple(val)

    return arr

@cython.boundscheck(False)
@cython.wraparound(False)
def write_csv_rows(list data, ndarray data_index, int nlevels, ndarray cols, object writer):
Expand Down
22 changes: 22 additions & 0 deletions pandas/tests/frame/test_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -790,6 +790,28 @@ def test_to_csv_unicode_index_col(self):
df2 = read_csv(buf, index_col=0, encoding='UTF-8')
assert_frame_equal(df, df2)

def test_to_csv_bytes(self):
    # GH 9712: bytes column labels and values must be decoded so the
    # written CSV carries no b'' prefixes on Python 3
    hourly = pd.date_range("2013-10-27 23:00", "2013-10-28 00:00",
                           freq="H")
    df = DataFrame.from_items([(b'hello', ['a', b'b']),
                               (b'times', hourly)])
    df.loc[2] = np.nan
    df.index.name = 'idx'

    with ensure_clean() as path:
        df.to_csv(path)
        with open(path) as fh:
            actual = fh.readlines()

    expected = ["idx,hello,times\n",
                "0,a,2013-10-27 23:00:00\n",
                "1,b,2013-10-28 00:00:00\n",
                "2,,\n"]
    assert actual == expected

def test_to_csv_stringio(self):
buf = StringIO()
self.frame.to_csv(buf)
Expand Down