diff --git a/pandas/core/format.py b/pandas/core/format.py index 13e504a8e1f88..f2999c63db38e 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -36,7 +36,7 @@ string representation of NAN to use, default 'NaN' formatters : list or dict of one-parameter functions, optional formatter functions to apply to columns' elements by position or name, - default None + default None. If a formatter's result is a string, it must be a unicode string. float_format : one-parameter function, optional formatter function to apply to columns' elements if they are floats default None @@ -62,7 +62,7 @@ class SeriesFormatter(object): def __init__(self, series, buf=None, header=True, length=True, na_rep='NaN', name=False, float_format=None): self.series = series - self.buf = buf if buf is not None else StringIO() + self.buf = buf if buf is not None else StringIO(u"") self.name = name self.na_rep = na_rep self.length = length @@ -112,7 +112,7 @@ def to_string(self): series = self.series if len(series) == 0: - return '' + return u'' fmt_index, have_header = self._get_formatted_index() fmt_values = self._get_formatted_values() @@ -135,9 +135,7 @@ def to_string(self): if footer: result.append(footer) - if py3compat.PY3: - return unicode(u'\n'.join(result)) - return com.console_encode(u'\n'.join(result)) + return unicode(u'\n'.join(result)) if py3compat.PY3: # pragma: no cover _encode_diff = lambda x: 0 @@ -200,10 +198,15 @@ def __init__(self, frame, buf=None, columns=None, col_space=None, else: self.columns = frame.columns - def _to_str_columns(self, force_unicode=False): + def _to_str_columns(self, force_unicode=None): """ Render a DataFrame to a list of columns (as lists of strings). 
""" + import warnings + if force_unicode is not None: # pragma: no cover + warnings.warn("force_unicode is deprecated, it will have no effect", + FutureWarning) + # may include levels names also str_index = self._get_formatted_index() str_columns = self._get_formatted_column_labels() @@ -237,32 +240,17 @@ def _to_str_columns(self, force_unicode=False): if self.index: strcols.insert(0, str_index) - if not py3compat.PY3: - if force_unicode: - def make_unicode(x): - if isinstance(x, unicode): - return x - return x.decode('utf-8') - strcols = map(lambda col: map(make_unicode, col), strcols) - else: - # Generally everything is plain strings, which has ascii - # encoding. Problem is when there is a char with value over - # 127. Everything then gets converted to unicode. - try: - map(lambda col: map(str, col), strcols) - except UnicodeError: - def make_unicode(x): - if isinstance(x, unicode): - return x - return x.decode('utf-8') - strcols = map(lambda col: map(make_unicode, col), strcols) - return strcols - def to_string(self, force_unicode=False): + def to_string(self, force_unicode=None): """ Render a DataFrame to a console-friendly tabular output. """ + import warnings + if force_unicode is not None: # pragma: no cover + warnings.warn("force_unicode is deprecated, it will have no effect", + FutureWarning) + frame = self.frame if len(frame.columns) == 0 or len(frame.index) == 0: @@ -272,15 +260,20 @@ def to_string(self, force_unicode=False): com.pprint_thing(frame.index))) text = info_line else: - strcols = self._to_str_columns(force_unicode) + strcols = self._to_str_columns() text = adjoin(1, *strcols) self.buf.writelines(text) - def to_latex(self, force_unicode=False, column_format=None): + def to_latex(self, force_unicode=None, column_format=None): """ Render a DataFrame to a LaTeX tabular environment output. 
""" + import warnings + if force_unicode is not None: # pragma: no cover + warnings.warn("force_unicode is deprecated, it will have no effect", + FutureWarning) + frame = self.frame if len(frame.columns) == 0 or len(frame.index) == 0: @@ -289,7 +282,7 @@ def to_latex(self, force_unicode=False, column_format=None): frame.columns, frame.index)) strcols = [[info_line]] else: - strcols = self._to_str_columns(force_unicode) + strcols = self._to_str_columns() if column_format is None: column_format = '|l|%s|' % '|'.join('c' for _ in strcols) @@ -726,18 +719,10 @@ def __init__(self, values, digits=7, formatter=None, na_rep='NaN', self.justify = justify def get_result(self): - if self._have_unicode(): - fmt_values = self._format_strings(use_unicode=True) - else: - fmt_values = self._format_strings(use_unicode=False) - + fmt_values = self._format_strings() return _make_fixed_width(fmt_values, self.justify) - def _have_unicode(self): - mask = lib.map_infer(self.values, lambda x: isinstance(x, unicode)) - return mask.any() - - def _format_strings(self, use_unicode=False): + def _format_strings(self): if self.float_format is None: float_format = print_config.float_format if float_format is None: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f7f296e822e15..a160c994e94a9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -612,20 +612,51 @@ def _need_info_repr_(self): else: return False - def __repr__(self): + def __str__(self): + """ + Return a string representation for a particular DataFrame + + Invoked by str(df) in both py2/py3. + Yields Bytestring in Py2, Unicode String in py3. + """ + + if py3compat.PY3: + return self.__unicode__() + return self.__bytes__() + + def __bytes__(self): """ Return a string representation for a particular DataFrame + + Invoked by bytes(df) in py3 only. + Yields a bytestring in both py2/py3. 
+ """ + return com.console_encode(self.__unicode__()) + + def __unicode__(self): + """ + Return a string representation for a particular DataFrame + + Invoked by unicode(df) in py2 only. Yields a Unicode String in both py2/py3. """ - buf = StringIO() + buf = StringIO(u"") if self._need_info_repr_(): self.info(buf=buf, verbose=self._verbose_info) else: self.to_string(buf=buf) + value = buf.getvalue() + assert type(value) == unicode - if py3compat.PY3: - return unicode(value) - return com.console_encode(value) + return value + + def __repr__(self): + """ + Return a string representation for a particular DataFrame + + Yields Bytestring in Py2, Unicode String in py3. + """ + return str(self) def _repr_html_(self): """ @@ -1379,19 +1410,21 @@ def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='', def to_string(self, buf=None, columns=None, col_space=None, colSpace=None, header=True, index=True, na_rep='NaN', formatters=None, float_format=None, sparsify=None, nanRep=None, - index_names=True, justify=None, force_unicode=False): + index_names=True, justify=None, force_unicode=None): """ Render a DataFrame to a console-friendly tabular output. 
""" + import warnings + if force_unicode is not None: # pragma: no cover + warnings.warn("force_unicode is deprecated, it will have no effect", + FutureWarning) if nanRep is not None: # pragma: no cover - import warnings warnings.warn("nanRep is deprecated, use na_rep", FutureWarning) na_rep = nanRep if colSpace is not None: # pragma: no cover - import warnings warnings.warn("colSpace is deprecated, use col_space", FutureWarning) col_space = colSpace @@ -1404,15 +1437,10 @@ def to_string(self, buf=None, columns=None, col_space=None, colSpace=None, justify=justify, index_names=index_names, header=header, index=index) - formatter.to_string(force_unicode=force_unicode) + formatter.to_string() if buf is None: result = formatter.buf.getvalue() - if not force_unicode: - try: - result = str(result) - except ValueError: - pass return result @Appender(fmt.docstring_to_string, indents=1) diff --git a/pandas/core/index.py b/pandas/core/index.py index b7792309f66ff..133449d79d521 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -132,12 +132,48 @@ def __array_finalize__(self, obj): def _shallow_copy(self): return self.view() - def __repr__(self): + def __str__(self): + """ + Return a string representation for a particular Index + + Invoked by str(df) in both py2/py3. + Yields Bytestring in Py2, Unicode String in py3. + """ + if py3compat.PY3: - prepr = com.pprint_thing(self) + return self.__unicode__() + return self.__bytes__() + + def __bytes__(self): + """ + Return a string representation for a particular Index + + Invoked by bytes(df) in py3 only. + Yields a bytestring in both py2/py3. + """ + return com.console_encode(self.__unicode__()) + + def __unicode__(self): + """ + Return a string representation for a particular Index + + Invoked by unicode(df) in py2 only. Yields a Unicode String in both py2/py3. 
+ """ + if len(self) > 6 and len(self) > np.get_printoptions()['threshold']: + data = self[:3].tolist() + ["..."] + self[-3:].tolist() else: - prepr = com.pprint_thing_encoded(self) - return 'Index(%s, dtype=%s)' % (prepr, self.dtype) + data = self + + prepr = com.pprint_thing(data) + return '%s(%s, dtype=%s)' % (type(self).__name__, prepr, self.dtype) + + def __repr__(self): + """ + Return a string representation for a particular Index + + Yields Bytestring in Py2, Unicode String in py3. + """ + return str(self) def astype(self, dtype): return Index(self.values.astype(dtype), name=self.name, @@ -207,15 +243,6 @@ def summary(self, name=None): name = type(self).__name__ return '%s: %s entries%s' % (name, len(self), index_summary) - def __str__(self): - try: - return np.array_repr(self.values) - except UnicodeError: - converted = u','.join(com.pprint_thing(x) for x in self.values) - result = u'%s([%s], dtype=''%s'')' % (type(self).__name__, converted, - str(self.values.dtype)) - return com.console_encode(result) - def _mpl_repr(self): # how to represent ourselves to matplotlib return self.values @@ -394,8 +421,8 @@ def format(self, name=False): result = [] for dt in self: if dt.time() != zero_time or dt.tzinfo is not None: - return header + ['%s' % x for x in self] - result.append('%d-%.2d-%.2d' % (dt.year, dt.month, dt.day)) + return header + [u'%s' % x for x in self] + result.append(u'%d-%.2d-%.2d' % (dt.year, dt.month, dt.day)) return header + result values = self.values @@ -1319,7 +1346,33 @@ def _array_values(self): def dtype(self): return np.dtype('O') - def __repr__(self): + def __str__(self): + """ + Return a string representation for a particular Index + + Invoked by str(df) in both py2/py3. + Yields Bytestring in Py2, Unicode String in py3. + """ + + if py3compat.PY3: + return self.__unicode__() + return self.__bytes__() + + def __bytes__(self): + """ + Return a string representation for a particular Index + + Invoked by bytes(df) in py3 only. 
+ Yields a bytestring in both py2/py3. + """ + return com.console_encode(self.__unicode__()) + + def __unicode__(self): + """ + Return a string representation for a particular Index + + Invoked by unicode(df) in py2 only. Yields a Unicode String in both py2/py3. + """ output = 'MultiIndex\n%s' options = np.get_printoptions() @@ -1335,10 +1388,15 @@ def __repr__(self): np.set_printoptions(threshold=options['threshold']) - if py3compat.PY3: - return output % summary - else: - return com.console_encode(output % summary) + return output % summary + + def __repr__(self): + """ + Return a string representation for a particular Index + + Yields Bytestring in Py2, Unicode String in py3. + """ + return str(self) def __len__(self): return len(self.labels[0]) @@ -1496,7 +1554,7 @@ def format(self, space=2, sparsify=None, adjoin=True, names=False, formatted = lev.take(lab).format() else: # weird all NA case - formatted = [str(x) for x in com.take_1d(lev.values, lab)] + formatted = [com.pprint_thing(x) for x in com.take_1d(lev.values, lab)] stringified_levels.append(formatted) result_levels = [] diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 2dca8a2aef801..ae4a5d868b139 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -386,34 +386,70 @@ def __array_wrap__(self, result): #---------------------------------------------------------------------- # Magic methods - def __repr__(self): + def __str__(self): + """ + Return a string representation for a particular Panel + + Invoked by str(df) in both py2/py3. + Yields Bytestring in Py2, Unicode String in py3. + """ + + if py3compat.PY3: + return self.__unicode__() + return self.__bytes__() + + def __bytes__(self): + """ + Return a string representation for a particular Panel + + Invoked by bytes(df) in py3 only. + Yields a bytestring in both py2/py3. 
+ """ + return com.console_encode(self.__unicode__()) + + def __unicode__(self): + """ + Return a string representation for a particular Panel + + Invoked by unicode(panel) in py2 only. Yields a Unicode String in both py2/py3. + """ + class_name = str(self.__class__) I, N, K = len(self.items), len(self.major_axis), len(self.minor_axis) - dims = 'Dimensions: %d (items) x %d (major) x %d (minor)' % (I, N, K) + dims = u'Dimensions: %d (items) x %d (major) x %d (minor)' % (I, N, K) if len(self.major_axis) > 0: - major = 'Major axis: %s to %s' % (self.major_axis[0], - self.major_axis[-1]) + major = u'Major axis: %s to %s' % (com.pprint_thing(self.major_axis[0]), + com.pprint_thing(self.major_axis[-1])) else: - major = 'Major axis: None' + major = u'Major axis: None' if len(self.minor_axis) > 0: - minor = 'Minor axis: %s to %s' % (self.minor_axis[0], - self.minor_axis[-1]) + minor = u'Minor axis: %s to %s' % (com.pprint_thing(self.minor_axis[0]), + com.pprint_thing(self.minor_axis[-1])) else: - minor = 'Minor axis: None' + minor = u'Minor axis: None' if len(self.items) > 0: - items = 'Items: %s to %s' % (self.items[0], self.items[-1]) + items = u'Items: %s to %s' % (com.pprint_thing(self.items[0]), + com.pprint_thing(self.items[-1])) else: - items = 'Items: None' + items = u'Items: None' - output = '%s\n%s\n%s\n%s\n%s' % (class_name, dims, items, major, minor) + output = u'%s\n%s\n%s\n%s\n%s' % (class_name, dims, items, major, minor) return output + def __repr__(self): + """ + Return a string representation for a particular Panel + + Yields Bytestring in Py2, Unicode String in py3. 
+ """ + return str(self) + def __iter__(self): return iter(self.items) diff --git a/pandas/core/series.py b/pandas/core/series.py index 3241044a63c68..dc7588847775b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -858,8 +858,34 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): return df.reset_index(level=level, drop=drop) - def __repr__(self): - """Clean string representation of a Series""" + + def __str__(self): + """ + Return a string representation for a particular Series + + Invoked by str(series) in both py2/py3. + Yields Bytestring in Py2, Unicode String in py3. + """ + + if py3compat.PY3: + return self.__unicode__() + return self.__bytes__() + + def __bytes__(self): + """ + Return a string representation for a particular Series + + Invoked by bytes(series) in py3 only. + Yields a bytestring in both py2/py3. + """ + return com.console_encode(self.__unicode__()) + + def __unicode__(self): + """ + Return a string representation for a particular Series + + Invoked by unicode(series) in py2 only. Yields a Unicode String in both py2/py3. + """ width, height = get_terminal_size() max_rows = (height if fmt.print_config.max_rows == 0 else fmt.print_config.max_rows) @@ -870,13 +896,24 @@ def __repr__(self): length=len(self) > 50, name=True) else: - result = '%s' % ndarray.__repr__(self) + result = com.pprint_thing(self) - if py3compat.PY3: - return unicode(result) - return com.console_encode(result) + assert type(result) == unicode + return result + + def __repr__(self): + """ + Return a string representation for a particular Series + + Yields Bytestring in Py2, Unicode String in py3. 
+ """ + return str(self) def _tidy_repr(self, max_vals=20): + """ + + Internal function, should always return unicode string + """ num = max_vals // 2 head = self[:num]._get_repr(print_header=True, length=False, name=False) @@ -884,11 +921,13 @@ def _tidy_repr(self, max_vals=20): length=False, name=False) result = head + '\n...\n' + tail - return '%s\n%s' % (result, self._repr_footer()) + result = '%s\n%s' % (result, self._repr_footer()) + + return unicode(result) def _repr_footer(self): - namestr = "Name: %s, " % com.pprint_thing(self.name) if self.name is not None else "" - return '%sLength: %d' % (namestr, len(self)) + namestr = u"Name: %s, " % com.pprint_thing(self.name) if self.name is not None else "" + return u'%sLength: %d' % (namestr, len(self)) def to_string(self, buf=None, na_rep='NaN', float_format=None, nanRep=None, length=False, name=False): @@ -921,6 +960,9 @@ def to_string(self, buf=None, na_rep='NaN', float_format=None, the_repr = self._get_repr(float_format=float_format, na_rep=na_rep, length=length, name=name) + + assert type(the_repr) == unicode + if buf is None: return the_repr else: @@ -928,13 +970,17 @@ def to_string(self, buf=None, na_rep='NaN', float_format=None, def _get_repr(self, name=False, print_header=False, length=True, na_rep='NaN', float_format=None): + """ + + Internal function, should always return unicode string + """ + formatter = fmt.SeriesFormatter(self, name=name, header=print_header, length=length, na_rep=na_rep, float_format=float_format) - return formatter.to_string() - - def __str__(self): - return repr(self) + result = formatter.to_string() + assert type(result) == unicode + return result def __iter__(self): if np.issubdtype(self.dtype, np.datetime64): diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py index 10bb75bfbb5b6..0b5182acb7f72 100644 --- a/pandas/tests/test_format.py +++ b/pandas/tests/test_format.py @@ -135,7 +135,7 @@ def test_to_string_unicode_columns(self): df.info(buf=buf) 
buf.getvalue() - result = self.frame.to_string(force_unicode=True) + result = self.frame.to_string() self.assert_(isinstance(result, unicode)) def test_to_string_unicode_two(self): @@ -495,7 +495,6 @@ def test_to_string_int_formatting(self): self.assert_(issubclass(df['x'].dtype.type, np.integer)) output = df.to_string() - self.assert_(isinstance(output, str)) expected = (' x\n' '0 -15\n' '1 20\n' @@ -841,16 +840,16 @@ def test_to_string(self): def test_to_string_mixed(self): s = Series(['foo', np.nan, -1.23, 4.56]) result = s.to_string() - expected = ('0 foo\n' - '1 NaN\n' - '2 -1.23\n' - '3 4.56') + expected = (u'0 foo\n' + u'1 NaN\n' + u'2 -1.23\n' + u'3 4.56') self.assertEqual(result, expected) # but don't count NAs as floats s = Series(['foo', np.nan, 'bar', 'baz']) result = s.to_string() - expected = ('0 foo\n' + expected = (u'0 foo\n' '1 NaN\n' '2 bar\n' '3 baz') @@ -858,7 +857,7 @@ def test_to_string_mixed(self): s = Series(['foo', 5, 'bar', 'baz']) result = s.to_string() - expected = ('0 foo\n' + expected = (u'0 foo\n' '1 5\n' '2 bar\n' '3 baz') @@ -869,7 +868,7 @@ def test_to_string_float_na_spacing(self): s[::2] = np.nan result = s.to_string() - expected = ('0 NaN\n' + expected = (u'0 NaN\n' '1 1.5678\n' '2 NaN\n' '3 -3.0000\n' diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index fea84f5a86e36..4eb1be94e0846 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -27,6 +27,7 @@ from pandas.util.testing import (assert_almost_equal, assert_series_equal, assert_frame_equal) +from pandas.util import py3compat import pandas.util.testing as tm import pandas.lib as lib @@ -2916,6 +2917,21 @@ def test_repr_unicode(self): result = repr(df) self.assertEqual(result.split('\n')[0].rstrip(), ex_top) + def test_unicode_string_with_unicode(self): + df = DataFrame({'A': [u"\u05d0"]}) + + if py3compat.PY3: + str(df) + else: + unicode(df) + + def test_bytestring_with_unicode(self): + df = DataFrame({'A': [u"\u05d0"]}) + if 
py3compat.PY3: + bytes(df) + else: + str(df) + def test_very_wide_info_repr(self): df = DataFrame(np.random.randn(10, 20), columns=[tm.rands(10) for _ in xrange(20)]) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index b94840d0dfd85..4a86db9d67196 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -851,6 +851,21 @@ def test_print_unicode_columns(self): df=pd.DataFrame({u"\u05d0":[1,2,3],"\u05d1":[4,5,6],"c":[7,8,9]}) print(df.columns) # should not raise UnicodeDecodeError + def test_unicode_string_with_unicode(self): + idx = Index(range(1000)) + + if py3compat.PY3: + str(idx) + else: + unicode(idx) + + def test_bytestring_with_unicode(self): + idx = Index(range(1000)) + if py3compat.PY3: + bytes(idx) + else: + str(idx) + class TestMultiIndex(unittest.TestCase): def setUp(self): @@ -1680,6 +1695,24 @@ def test_repr_with_unicode_data(self): index=pd.DataFrame(d).set_index(["a","b"]).index self.assertFalse("\\u" in repr(index)) # we don't want unicode-escaped + def test_unicode_string_with_unicode(self): + d={"a":[u"\u05d0",2,3],"b":[4,5,6],"c":[7,8,9]} + idx=pd.DataFrame(d).set_index(["a","b"]).index + + if py3compat.PY3: + str(idx) + else: + unicode(idx) + + def test_bytestring_with_unicode(self): + d={"a":[u"\u05d0",2,3],"b":[4,5,6],"c":[7,8,9]} + idx=pd.DataFrame(d).set_index(["a","b"]).index + + if py3compat.PY3: + bytes(idx) + else: + str(idx) + def test_get_combined_index(): from pandas.core.index import _get_combined_index result = _get_combined_index([]) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index a906489e67b57..96de4784fdc99 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -1043,6 +1043,11 @@ def test_repr(self): rep_str = repr(ser) self.assert_("Name: 0" in rep_str) + def test_tidy_repr(self): + a=Series([u"\u05d0"]*1000) + a.name= 'title1' + repr(a) # should not raise exception + def test_repr_bool_fails(self): s = 
Series([DataFrame(np.random.randn(2,2)) for i in range(5)]) @@ -1078,6 +1083,22 @@ def test_repr_should_return_str (self): df=Series(data,index=index1) - self.assertTrue(type(df.__repr__() == str)) # both py2 / 3 + self.assertTrue(type(df.__repr__()) == str) # both py2 / 3 + + def test_unicode_string_with_unicode(self): + df = Series([u"\u05d0"],name=u"\u05d1") + if py3compat.PY3: + str(df) + else: + unicode(df) + + def test_bytestring_with_unicode(self): + df = Series([u"\u05d0"],name=u"\u05d1") + if py3compat.PY3: + bytes(df) + else: + str(df) + + def test_timeseries_repr_object_dtype(self): index = Index([datetime(2000, 1, 1) + timedelta(i) for i in range(1000)], dtype=object)