Merge branch 'main' into do-not-error-on-other-dbapi2-connections

pandas-dev · Jan 31, 2022 · 9eda931 · 9eda931
2 parents 0fa71e1 + bcf0af0
commit 9eda931
Show file tree

Hide file tree

Showing 45 changed files with 432 additions and 323 deletions.
diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst
diff --git a/doc/source/whatsnew/v1.4.1.rst b/doc/source/whatsnew/v1.4.1.rst
@@ -15,8 +15,10 @@ including other versions of pandas.
 Fixed regressions
 ~~~~~~~~~~~~~~~~~
 - Regression in :meth:`Series.mask` with ``inplace=True`` and ``PeriodDtype`` and an incompatible ``other`` coercing to a common dtype instead of raising (:issue:`45546`)
+- Regression in :func:`.assert_frame_equal` not respecting ``check_flags=False`` (:issue:`45554`)
+- Regression in :meth:`Series.fillna` with ``downcast=False`` incorrectly downcasting ``object`` dtype (:issue:`45603`)
 - Regression in :meth:`DataFrame.loc.__setitem__` losing :class:`Index` name if :class:`DataFrame` was empty before (:issue:`45621`)
--
+- Regression in :func:`pandasSQL_builder` whereby ``sqlalchemy`` was incorrectly required as a dependency for DBAPI2 connection objects that aren't from ``sqlite3`` or ``sqlalchemy`` (:issue:`45660`)
 
 .. ---------------------------------------------------------------------------
 

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -234,6 +234,7 @@ Timezones
 Numeric
 ^^^^^^^
 - Bug in operations with array-likes with ``dtype="boolean"`` and :attr:`NA` incorrectly altering the array in-place (:issue:`45421`)
+- Bug in multiplying a :class:`Series` with ``IntegerDtype`` or ``FloatingDtype`` by an arraylike with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`45622`)
 -
 
 Conversion
@@ -259,12 +260,15 @@ Indexing
 ^^^^^^^^
 - Bug in :meth:`loc.__getitem__` with a list of keys causing an internal inconsistency that could lead to a disconnect between ``frame.at[x, y]`` vs ``frame[y].loc[x]`` (:issue:`22372`)
 - Bug in :meth:`DataFrame.iloc` where indexing a single row on a :class:`DataFrame` with a single ExtensionDtype column gave a copy instead of a view on the underlying data (:issue:`45241`)
+- Bug in setting an NA value (``None`` or ``np.nan``) into a :class:`Series` with int-based :class:`IntervalDtype` incorrectly casting to object dtype instead of a float-based :class:`IntervalDtype` (:issue:`45568`)
 - Bug in :meth:`Series.__setitem__` with a non-integer :class:`Index` when using an integer key to set a value that cannot be set inplace where a ``ValueError`` was raised instead of casting to a common dtype (:issue:`45070`)
 - Bug when setting a value too large for a :class:`Series` dtype failing to coerce to a common type (:issue:`26049`, :issue:`32878`)
 - Bug in :meth:`loc.__setitem__` treating ``range`` keys as positional instead of label-based (:issue:`45479`)
 - Bug in :meth:`Series.__setitem__` when setting ``boolean`` dtype values containing ``NA`` incorrectly raising instead of casting to ``boolean`` dtype (:issue:`45462`)
 - Bug in :meth:`Series.__setitem__` where setting :attr:`NA` into a numeric-dtype :class:`Series` would incorrectly upcast to object-dtype rather than treating the value as ``np.nan`` (:issue:`44199`)
+- Bug in :meth:`DataFrame.mask` with ``inplace=True`` and ``ExtensionDtype`` columns incorrectly raising (:issue:`45577`)
 - Bug in getting a column from a DataFrame with an object-dtype row index with datetime-like values: the resulting Series now preserves the exact object-dtype Index from the parent DataFrame (:issue:`42950`)
+- Bug in indexing on a :class:`DatetimeIndex` with a ``np.str_`` key incorrectly raising (:issue:`45580`)
 -
 
 Missing
@@ -305,6 +309,7 @@ Groupby/resample/rolling
 Reshaping
 ^^^^^^^^^
 - Bug in :func:`concat` between a :class:`Series` with integer dtype and another with :class:`CategoricalDtype` with integer categories and containing ``NaN`` values casting to object dtype instead of ``float64`` (:issue:`45359`)
+- Bug in :func:`get_dummies` that selected object and categorical dtypes but not string (:issue:`44965`)
 -
 
 Sparse

diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx
@@ -291,7 +291,7 @@ def parse_datetime_string(
     return dt
 
 
-def parse_time_string(arg: str, freq=None, dayfirst=None, yearfirst=None):
+def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None):
     """
     Try hard to parse datetime string, leveraging dateutil plus some extra
     goodies like quarter recognition.
@@ -312,6 +312,16 @@ def parse_time_string(arg: str, freq=None, dayfirst=None, yearfirst=None):
     str
         Describing resolution of parsed string.
     """
+    if type(arg) is not str:
+        # GH#45580 np.str_ satisfies isinstance(obj, str) but if we annotate
+        #  arg as "str" this raises here
+        if not isinstance(arg, np.str_):
+            raise TypeError(
+                "Argument 'arg' has incorrect type "
+                f"(expected str, got {type(arg).__name__})"
+            )
+        arg = str(arg)
+
     if is_offset_object(freq):
         freq = freq.rule_code
 

diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py
@@ -1344,6 +1344,7 @@ def assert_frame_equal(
                 rtol=rtol,
                 atol=atol,
                 check_index=False,
+                check_flags=False,
             )
 
 

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -631,6 +631,10 @@ def factorize(
     cut : Discretize continuous-valued array.
     unique : Find the unique value in an array.
 
+    Notes
+    -----
+    Reference :ref:`the user guide <reshaping.factorize>` for more examples.
+
     Examples
     --------
     These examples all show factorize as a top-level method like

diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
@@ -349,11 +349,10 @@ def _coerce_to_array(
     def _logical_method(self, other, op):
 
         assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"}
-        other_is_booleanarray = isinstance(other, BooleanArray)
         other_is_scalar = lib.is_scalar(other)
         mask = None
 
-        if other_is_booleanarray:
+        if isinstance(other, BooleanArray):
             other, mask = other._data, other._mask
         elif is_list_like(other):
             other = np.asarray(other, dtype="bool")
@@ -370,7 +369,7 @@ def _logical_method(self, other, op):
             )
 
         if not other_is_scalar and len(self) != len(other):
-            raise ValueError("Lengths must match to compare")
+            raise ValueError("Lengths must match")
 
         if op.__name__ in {"or_", "ror_"}:
             result, mask = ops.kleene_or(self._data, other, self._mask, mask)
@@ -387,7 +386,7 @@ def _arith_method(self, other, op):
         mask = None
         op_name = op.__name__
 
-        if isinstance(other, BooleanArray):
+        if isinstance(other, BaseMaskedArray):
             other, mask = other._data, other._mask
 
         elif is_list_like(other):
@@ -397,14 +396,7 @@ def _arith_method(self, other, op):
             if len(self) != len(other):
                 raise ValueError("Lengths must match")
 
-        # nans propagate
-        if mask is None:
-            mask = self._mask
-            if other is libmissing.NA:
-                # GH#45421 don't alter inplace
-                mask = mask | True
-        else:
-            mask = self._mask | mask
+        mask = self._propagate_mask(mask, other)
 
         if other is libmissing.NA:
             # if other is NA, the result will be all NA and we can't run the
@@ -425,14 +417,6 @@ def _arith_method(self, other, op):
             with np.errstate(all="ignore"):
                 result = op(self._data, other)
 
-        # divmod returns a tuple
-        if op_name == "divmod":
-            div, mod = result
-            return (
-                self._maybe_mask_result(div, mask, other, "floordiv"),
-                self._maybe_mask_result(mod, mask, other, "mod"),
-            )
-
         return self._maybe_mask_result(result, mask, other, op_name)
 
     def __abs__(self):

diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py
@@ -18,10 +18,7 @@
 
 from pandas._config import get_option
 
-from pandas._libs import (
-    NaT,
-    lib,
-)
+from pandas._libs import lib
 from pandas._libs.interval import (
     VALID_CLOSED,
     Interval,
@@ -44,8 +41,6 @@
 
 from pandas.core.dtypes.common import (
     is_categorical_dtype,
-    is_datetime64_dtype,
-    is_datetime64tz_dtype,
     is_dtype_equal,
     is_float_dtype,
     is_integer_dtype,
@@ -54,7 +49,6 @@
     is_object_dtype,
     is_scalar,
     is_string_dtype,
-    is_timedelta64_dtype,
     needs_i8_conversion,
     pandas_dtype,
 )
@@ -1103,30 +1097,23 @@ def _validate_scalar(self, value):
             # TODO: check subdtype match like _validate_setitem_value?
         elif is_valid_na_for_dtype(value, self.left.dtype):
             # GH#18295
-            left = right = value
+            left = right = self.left._na_value
         else:
             raise TypeError(
                 "can only insert Interval objects and NA into an IntervalArray"
             )
         return left, right
 
     def _validate_setitem_value(self, value):
-        needs_float_conversion = False
 
         if is_valid_na_for_dtype(value, self.left.dtype):
             # na value: need special casing to set directly on numpy arrays
+            value = self.left._na_value
             if is_integer_dtype(self.dtype.subtype):
                 # can't set NaN on a numpy integer array
-                needs_float_conversion = True
-            elif is_datetime64_dtype(self.dtype.subtype):
-                # need proper NaT to set directly on the numpy array
-                value = np.datetime64("NaT")
-            elif is_datetime64tz_dtype(self.dtype.subtype):
-                # need proper NaT to set directly on the DatetimeArray array
-                value = NaT
-            elif is_timedelta64_dtype(self.dtype.subtype):
-                # need proper NaT to set directly on the numpy array
-                value = np.timedelta64("NaT")
+                # GH#45484 TypeError, not ValueError, matches what we get with
+                #  non-NA un-holdable value.
+                raise TypeError("Cannot set float NaN to integer-backed IntervalArray")
             value_left, value_right = value, value
 
         elif isinstance(value, Interval):
@@ -1139,10 +1126,6 @@ def _validate_setitem_value(self, value):
         else:
             return self._validate_listlike(value)
 
-        if needs_float_conversion:
-            # GH#45484 TypeError, not ValueError, matches what we get with
-            #  non-NA un-holdable value.
-            raise TypeError("Cannot set float NaN to integer-backed IntervalArray")
         return value_left, value_right
 
     def value_counts(self, dropna: bool = True):

diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
@@ -13,7 +13,6 @@
 import numpy as np
 
 from pandas._libs import (
-    iNaT,
     lib,
     missing as libmissing,
 )
@@ -582,6 +581,18 @@ def _hasna(self) -> bool:
         # error: Incompatible return value type (got "bool_", expected "bool")
         return self._mask.any()  # type: ignore[return-value]
 
+    def _propagate_mask(
+        self, mask: npt.NDArray[np.bool_] | None, other
+    ) -> npt.NDArray[np.bool_]:
+        if mask is None:
+            mask = self._mask.copy()  # TODO: need test for BooleanArray needing a copy
+            if other is libmissing.NA:
+                # GH#45421 don't alter inplace
+                mask = mask | True
+        else:
+            mask = self._mask | mask
+        return mask
+
     def _cmp_method(self, other, op) -> BooleanArray:
         from pandas.core.arrays import BooleanArray
 
@@ -619,12 +630,7 @@ def _cmp_method(self, other, op) -> BooleanArray:
                 if result is NotImplemented:
                     result = invalid_comparison(self._data, other, op)
 
-        # nans propagate
-        if mask is None:
-            mask = self._mask.copy()
-        else:
-            mask = self._mask | mask
-
+        mask = self._propagate_mask(mask, other)
         return BooleanArray(result, mask, copy=False)
 
     def _maybe_mask_result(self, result, mask, other, op_name: str):
@@ -636,6 +642,14 @@ def _maybe_mask_result(self, result, mask, other, op_name: str):
         other : scalar or array-like
         op_name : str
         """
+        if op_name == "divmod":
+            # divmod returns a tuple
+            div, mod = result
+            return (
+                self._maybe_mask_result(div, mask, other, "floordiv"),
+                self._maybe_mask_result(mod, mask, other, "mod"),
+            )
+
         # if we have a float operand we are by-definition
         # a float result
         # or our op is a divide
@@ -657,8 +671,11 @@ def _maybe_mask_result(self, result, mask, other, op_name: str):
             # e.g. test_numeric_arr_mul_tdscalar_numexpr_path
             from pandas.core.arrays import TimedeltaArray
 
-            result[mask] = iNaT
-            return TimedeltaArray._simple_new(result)
+            if not isinstance(result, TimedeltaArray):
+                result = TimedeltaArray._simple_new(result)
+
+            result[mask] = result.dtype.type("NaT")
+            return result
 
         elif is_integer_dtype(result):
             from pandas.core.arrays import IntegerArray