-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: avoid "b" prefix for bytes in to_csv() on Python 3 (#9712) #13890
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1378,6 +1378,12 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', | |
self.has_mi_columns = (isinstance(obj.columns, MultiIndex) and | ||
not self.tupleize_cols) | ||
|
||
# in Python 3, decode bytes to str so strings print without b'' | ||
if compat.PY3: | ||
self.bytes_encoding = (encoding or get_option("display.encoding")) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why are we calling this |
||
else: | ||
self.bytes_encoding = None | ||
|
||
# validate mi options | ||
if self.has_mi_columns: | ||
if cols is not None: | ||
|
@@ -1387,6 +1393,7 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', | |
if cols is not None: | ||
if isinstance(cols, Index): | ||
cols = cols.to_native_types(na_rep=na_rep, | ||
bytes_encoding=self.bytes_encoding, | ||
float_format=float_format, | ||
date_format=date_format, | ||
quoting=self.quoting) | ||
|
@@ -1399,6 +1406,7 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', | |
cols = self.obj.columns | ||
if isinstance(cols, Index): | ||
cols = cols.to_native_types(na_rep=na_rep, | ||
bytes_encoding=self.bytes_encoding, | ||
float_format=float_format, | ||
date_format=date_format, | ||
quoting=self.quoting) | ||
|
@@ -1506,6 +1514,8 @@ def _save_header(self): | |
else: | ||
encoded_labels = [] | ||
|
||
self._bytes_to_str(encoded_labels) | ||
|
||
if not has_mi_columns: | ||
encoded_labels += list(write_cols) | ||
|
||
|
@@ -1565,6 +1575,7 @@ def _save_chunk(self, start_i, end_i): | |
for i in range(len(self.blocks)): | ||
b = self.blocks[i] | ||
d = b.to_native_types(slicer=slicer, na_rep=self.na_rep, | ||
bytes_encoding=self.bytes_encoding, | ||
float_format=self.float_format, | ||
decimal=self.decimal, | ||
date_format=self.date_format, | ||
|
@@ -1575,13 +1586,22 @@ def _save_chunk(self, start_i, end_i): | |
self.data[col_loc] = col | ||
|
||
ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep, | ||
bytes_encoding=self.bytes_encoding, | ||
float_format=self.float_format, | ||
decimal=self.decimal, | ||
date_format=self.date_format, | ||
quoting=self.quoting) | ||
|
||
lib.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer) | ||
|
||
def _bytes_to_str(self, values): | ||
"""Modify values list by decoding bytes to str.""" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. So I think if we have mixed bytes & strings then we should simply raise a ValueError. You can use There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I tried using |
||
if self.bytes_encoding: | ||
for ii, value in enumerate(values): | ||
if isinstance(value, bytes): | ||
values[ii] = value.decode(self.bytes_encoding) | ||
|
||
|
||
# from collections import namedtuple | ||
# ExcelCell = namedtuple("ExcelCell", | ||
# 'row, col, val, style, mergestart, mergeend') | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1580,12 +1580,15 @@ def _format_with_header(self, header, na_rep='NaN', **kwargs): | |
result = _trim_front(format_array(values, None, justify='left')) | ||
return header + result | ||
|
||
def to_native_types(self, slicer=None, **kwargs): | ||
def to_native_types(self, slicer=None, bytes_encoding=None, **kwargs): | ||
""" slice and dice then format """ | ||
values = self | ||
if slicer is not None: | ||
values = values[slicer] | ||
return values._format_native_types(**kwargs) | ||
result = values._format_native_types(**kwargs) | ||
if bytes_encoding is not None and result.dtype == object: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Use |
||
lib.object_array_decode_bytes(result, bytes_encoding) | ||
return result | ||
|
||
def _format_native_types(self, na_rep='', quoting=None, **kwargs): | ||
""" actually format my specific types """ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1053,6 +1053,25 @@ def string_array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_re | |
|
||
return arr | ||
|
||
@cython.boundscheck(False) | ||
@cython.wraparound(False) | ||
def object_array_decode_bytes(ndarray[object, ndim=1] arr, object encoding): | ||
"""Decode any instances of bytes to str in arr using the given encoding.""" | ||
if bytes == str: # in Python 2 these are the same and nothing needs to be done | ||
return | ||
|
||
cdef int length = arr.shape[0], i = 0 | ||
for i from 0 <= i < length: | ||
if isinstance(arr[i], bytes): | ||
arr[i] = arr[i].decode(encoding) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Again, instead of writing this, simply use We have a tremendous amount of code already in pandas, so we really, really try to avoid duplication. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Furthermore, that has been optimized to use certain codecs which Python is apparently very fast at. |
||
elif isinstance(arr[i], tuple): | ||
mask = [isinstance(it, bytes) for it in arr[i]] | ||
if any(mask): | ||
val = [it.decode(encoding) if mask[j] else it for j, it in enumerate(arr[i])] | ||
arr[i] = tuple(val) | ||
|
||
return arr | ||
|
||
@cython.boundscheck(False) | ||
@cython.wraparound(False) | ||
def write_csv_rows(list data, ndarray data_index, int nlevels, ndarray cols, object writer): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Move this to `Block`, where `to_native_types` already exists; you just need to incorporate this argument after stringifying (we have another conditional for whether we are quoting or not, but I am not sure whether you need to do this for that one, as I think the csv writer handles that). This is around line 590.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I put it in `ObjectBlock` because that's the only class that could contain objects -- correct? Given that a precondition of the code I'm adding is `dtype==object`, are you sure you'd want the logic to go in the base class for all types (even if it's overridden by some)?