Merge remote-tracking branch 'upstream/master' into sparse-frame-accessor
TomAugspurger · Mar 12, 2019 · 534a379 · 534a379
2 parents 94a7baf + 5c341dc
commit 534a379
Show file tree

Hide file tree

Showing 11 changed files with 90 additions and 145 deletions.
diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst
@@ -18,7 +18,7 @@ including other versions of pandas.
 .. _whatsnew_0242.regressions:
 
 Fixed Regressions
-^^^^^^^^^^^^^^^^^
+~~~~~~~~~~~~~~~~~
 
 - Fixed regression in :meth:`DataFrame.all` and :meth:`DataFrame.any` where ``bool_only=True`` was ignored (:issue:`25101`)
 - Fixed issue in ``DataFrame`` construction with passing a mixed list of mixed types could segfault. (:issue:`25075`)
@@ -31,71 +31,30 @@ Fixed Regressions
 - Fixed regression in ``IntervalDtype`` construction where passing an incorrect string with 'Interval' as a prefix could result in a ``RecursionError``. (:issue:`25338`)
 - Fixed regression in creating a period-dtype array from a read-only NumPy array of period objects. (:issue:`25403`)
 - Fixed regression in :class:`Categorical`, where constructing it from a categorical ``Series`` and an explicit ``categories=`` that differed from that in the ``Series`` created an invalid object which could trigger segfaults. (:issue:`25318`)
+- Fixed regression in :func:`to_timedelta` losing precision when converting floating data to ``Timedelta`` data (:issue:`25077`).
 - Fixed pip installing from source into an environment without NumPy (:issue:`25193`)
 - Fixed regression in :meth:`DataFrame.to_csv` writing duplicate line endings with gzip compress (:issue:`25311`)
 
-.. _whatsnew_0242.enhancements:
-
-Enhancements
-^^^^^^^^^^^^
-
--
--
-
 .. _whatsnew_0242.bug_fixes:
 
 Bug Fixes
 ~~~~~~~~~
 
-**Conversion**
-
--
--
--
-
-**Indexing**
-
--
--
--
-
 **I/O**
 
 - Better handling of terminal printing when the terminal dimensions are not known (:issue:`25080`)
 - Bug in reading a HDF5 table-format ``DataFrame`` created in Python 2, in Python 3 (:issue:`24925`)
 - Bug in reading a JSON with ``orient='table'`` generated by :meth:`DataFrame.to_json` with ``index=False`` (:issue:`25170`)
 - Bug where float indexes could have misaligned values when printing (:issue:`25061`)
--
-
-**Categorical**
-
--
--
--
-
-**Timezones**
-
--
--
--
-
-**Timedelta**
-
--
--
--
 
 **Reshaping**
 
 - Bug in :meth:`~pandas.core.groupby.GroupBy.transform` where applying a function to a timezone aware column would return a timezone naive result (:issue:`24198`)
 - Bug in :func:`DataFrame.join` when joining on a timezone aware :class:`DatetimeIndex` (:issue:`23931`)
--
 
 **Visualization**
 
 - Bug in :meth:`Series.plot` where a secondary y axis could not be set to log scale (:issue:`25545`)
--
--
 
 **Other**
 

diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -216,6 +216,7 @@ I/O
 - Bug in :func:`read_json` for ``orient='table'`` and float index, as it infers index dtype by default, which is not applicable because index dtype is already defined in the JSON schema (:issue:`25433`)
 - Bug in :func:`read_json` for ``orient='table'`` and string of float column names, as it makes a column name type conversion to Timestamp, which is not applicable because column names are already defined in the JSON schema (:issue:`25435`)
 - :meth:`DataFrame.to_html` now raises ``TypeError`` when using an invalid type for the ``classes`` parameter instead of ``AsseertionError`` (:issue:`25608`)
+- Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would lead to incorrect output when the ``header`` keyword is used (:issue:`16718`)
 -
 -
 

diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx
@@ -246,9 +246,11 @@ def array_to_timedelta64(object[:] values, unit='ns', errors='raise'):
     return iresult.base  # .base to access underlying np.ndarray
 
 
-cdef inline int64_t cast_from_unit(object ts, object unit) except? -1:
-    """ return a casting of the unit represented to nanoseconds
-        round the fractional part of a float to our precision, p """
+cpdef inline object precision_from_unit(object unit):
+    """
+    Return a casting of the unit represented to nanoseconds + the precision
+    to round the fractional part.
+    """
     cdef:
         int64_t m
         int p
@@ -285,6 +287,17 @@ cdef inline int64_t cast_from_unit(object ts, object unit) except? -1:
         p = 0
     else:
         raise ValueError("cannot cast unit {unit}".format(unit=unit))
+    return m, p
+
+
+cdef inline int64_t cast_from_unit(object ts, object unit) except? -1:
+    """ return a casting of the unit represented to nanoseconds
+        round the fractional part of a float to our precision, p """
+    cdef:
+        int64_t m
+        int p
+
+    m, p = precision_from_unit(unit)
 
     # just give me the unit back
     if ts is None:

diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
@@ -11,7 +11,7 @@
 from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT
 from pandas._libs.tslibs.fields import get_timedelta_field
 from pandas._libs.tslibs.timedeltas import (
-    array_to_timedelta64, parse_timedelta_unit)
+    array_to_timedelta64, parse_timedelta_unit, precision_from_unit)
 import pandas.compat as compat
 from pandas.util._decorators import Appender
 
@@ -918,12 +918,15 @@ def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"):
         copy = copy and not copy_made
 
     elif is_float_dtype(data.dtype):
-        # treat as multiples of the given unit.  If after converting to nanos,
-        #  there are fractional components left, these are truncated
-        #  (i.e. NOT rounded)
+        # cast the unit, multiply base/frac separately
+        # to avoid precision issues from float -> int
         mask = np.isnan(data)
-        coeff = np.timedelta64(1, unit) / np.timedelta64(1, 'ns')
-        data = (coeff * data).astype(np.int64).view('timedelta64[ns]')
+        m, p = precision_from_unit(unit)
+        base = data.astype(np.int64)
+        frac = data - base
+        if p:
+            frac = np.round(frac, p)
+        data = (base * m + (frac * m).astype(np.int64)).view('timedelta64[ns]')
         data[mask] = iNaT
         copy = False
 

diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
@@ -282,10 +282,10 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None,
         # consolidate data & figure out what our result ndim is going to be
         ndims = set()
         for obj in objs:
-            if not isinstance(obj, NDFrame):
-                msg = ('cannot concatenate object of type "{0}";'
-                       ' only pd.Series, pd.DataFrame, and pd.Panel'
-                       ' (deprecated) objs are valid'.format(type(obj)))
+            if not isinstance(obj, (Series, DataFrame)):
+                msg = ("cannot concatenate object of type '{}';"
+                       ' only Series and DataFrame objs are valid'
+                       .format(type(obj)))
                 raise TypeError(msg)
 
             # consolidate

diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
@@ -528,6 +528,10 @@ def _to_str_columns(self):
             else:
                 str_columns = self._get_formatted_column_labels(frame)
 
+            if self.show_row_idx_names:
+                for x in str_columns:
+                    x.append('')
+
             stringified = []
             for i, c in enumerate(frame):
                 cheader = str_columns[i]
@@ -770,11 +774,6 @@ def space_format(x, y):
                             need_leadsp[x] else x]
                            for i, (col, x) in enumerate(zip(columns,
                                                             fmt_columns))]
-
-        if self.show_row_idx_names:
-            for x in str_columns:
-                x.append('')
-
         # self.str_columns = str_columns
         return str_columns
 

diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py
@@ -23,16 +23,13 @@
 from pandas.core.computation.expressions import (
     _NUMEXPR_INSTALLED, _USE_NUMEXPR)
 from pandas.core.computation.ops import (
-    _arith_ops_syms, _binary_math_ops, _binary_ops_dict, _bool_ops_syms,
+    _arith_ops_syms, _binary_math_ops, _binary_ops_dict,
     _special_case_arith_ops_syms, _unary_math_ops)
 import pandas.util.testing as tm
 from pandas.util.testing import (
     assert_frame_equal, assert_numpy_array_equal, assert_produces_warning,
     assert_series_equal, makeCustomDataframe as mkdf, randbool)
 
-_series_frame_incompatible = _bool_ops_syms
-_scalar_skip = 'in', 'not in'
-
 
 @pytest.fixture(params=(
     pytest.param(engine,
@@ -162,13 +159,21 @@ def teardown_method(self, method):
         del self.pandas_rhses, self.pandas_lhses, self.current_engines
 
     @pytest.mark.slow
-    def test_complex_cmp_ops(self):
-        cmp_ops = ('!=', '==', '<=', '>=', '<', '>')
-        cmp2_ops = ('>', '<')
-        for lhs, cmp1, rhs, binop, cmp2 in product(self.lhses, cmp_ops,
-                                                   self.rhses, self.bin_ops,
-                                                   cmp2_ops):
-            self.check_complex_cmp_op(lhs, cmp1, rhs, binop, cmp2)
+    @pytest.mark.parametrize('cmp1', ['!=', '==', '<=', '>=', '<', '>'],
+                             ids=['ne', 'eq', 'le', 'ge', 'lt', 'gt'])
+    @pytest.mark.parametrize('cmp2', ['>', '<'], ids=['gt', 'lt'])
+    def test_complex_cmp_ops(self, cmp1, cmp2):
+        for lhs, rhs, binop in product(
+                self.lhses, self.rhses, self.bin_ops):
+            lhs_new = _eval_single_bin(lhs, cmp1, rhs, self.engine)
+            rhs_new = _eval_single_bin(lhs, cmp2, rhs, self.engine)
+            expected = _eval_single_bin(
+                lhs_new, binop, rhs_new, self.engine)
+
+            ex = '(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)'.format(
+                cmp1=cmp1, binop=binop, cmp2=cmp2)
+            result = pd.eval(ex, engine=self.engine, parser=self.parser)
+            self.check_equal(result, expected)
 
     def test_simple_cmp_ops(self):
         bool_lhses = (DataFrame(randbool(size=(10, 5))),
@@ -225,41 +230,6 @@ def check_equal(self, result, expected):
         else:
             assert result == expected
 
-    def check_complex_cmp_op(self, lhs, cmp1, rhs, binop, cmp2):
-        skip_these = _scalar_skip
-        ex = '(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)'.format(cmp1=cmp1,
-                                                                binop=binop,
-                                                                cmp2=cmp2)
-        scalar_with_in_notin = (is_scalar(rhs) and (cmp1 in skip_these or
-                                                    cmp2 in skip_these))
-        if scalar_with_in_notin:
-            with pytest.raises(TypeError):
-                pd.eval(ex, engine=self.engine, parser=self.parser)
-            with pytest.raises(TypeError):
-                pd.eval(ex, engine=self.engine, parser=self.parser,
-                        local_dict={'lhs': lhs, 'rhs': rhs})
-        else:
-            lhs_new = _eval_single_bin(lhs, cmp1, rhs, self.engine)
-            rhs_new = _eval_single_bin(lhs, cmp2, rhs, self.engine)
-            if (isinstance(lhs_new, Series) and
-                    isinstance(rhs_new, DataFrame) and
-                    binop in _series_frame_incompatible):
-                pass
-                # TODO: the code below should be added back when left and right
-                # hand side bool ops are fixed.
-                #
-                # try:
-                #     pytest.raises(Exception, pd.eval, ex,
-                #                   local_dict={'lhs': lhs, 'rhs': rhs},
-                #                   engine=self.engine, parser=self.parser)
-                # except AssertionError:
-                #     raise
-            else:
-                expected = _eval_single_bin(
-                    lhs_new, binop, rhs_new, self.engine)
-                result = pd.eval(ex, engine=self.engine, parser=self.parser)
-                self.check_equal(result, expected)
-
     def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs):
 
         def check_operands(left, right, cmp_op):

diff --git a/pandas/tests/indexes/timedeltas/test_tools.py b/pandas/tests/indexes/timedeltas/test_tools.py
@@ -181,3 +181,10 @@ def test_to_timedelta_on_missing_values(self):
 
         actual = pd.to_timedelta(pd.NaT)
         assert actual.value == timedelta_NaT.astype('int64')
+
+    def test_to_timedelta_float(self):
+        # https://github.com/pandas-dev/pandas/issues/25077
+        arr = np.arange(0, 1, 1e-6)[-10:]
+        result = pd.to_timedelta(arr, unit='s')
+        expected_asi8 = np.arange(999990000, int(1e9), 1000, dtype='int64')
+        tm.assert_numpy_array_equal(result.asi8, expected_asi8)
diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py
@@ -2380,6 +2380,14 @@ def test_to_string_header(self):
         exp = '0    0\n    ..\n9    9'
         assert res == exp
 
+    def test_to_string_multindex_header(self):
+        # GH 16718
+        df = (pd.DataFrame({'a': [0], 'b': [1], 'c': [2], 'd': [3]})
+              .set_index(['a', 'b']))
+        res = df.to_string(header=['r1', 'r2'])
+        exp = '    r1 r2\na b      \n0 1  2  3'
+        assert res == exp
+
 
 def _three_digit_exp():
     return '{x:.4g}'.format(x=1.7e8) == '1.7e+008'

diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py
@@ -735,3 +735,19 @@ def test_to_latex_float_format_no_fixed_width(self):
 \end{tabular}
 """
         assert df.to_latex(float_format='%.0f') == expected
+
+    def test_to_latex_multindex_header(self):
+        # GH 16718
+        df = (pd.DataFrame({'a': [0], 'b': [1], 'c': [2], 'd': [3]})
+              .set_index(['a', 'b']))
+        observed = df.to_latex(header=['r1', 'r2'])
+        expected = r"""\begin{tabular}{llrr}
+\toprule
+  &   & r1 & r2 \\
+a & b &    &    \\
+\midrule
+0 & 1 &  2 &  3 \\
+\bottomrule
+\end{tabular}
+"""
+        assert observed == expected