From 291ac7039de6a6e24de498fa493dcdec1482fb07 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 17 Aug 2022 09:52:09 -0700 Subject: [PATCH 1/4] REF: remove axes from Managers --- pandas/_libs/properties.pyx | 5 +- pandas/core/arraylike.py | 6 ++ pandas/core/frame.py | 47 ++++++--- pandas/core/generic.py | 130 ++++++++++++++++++++----- pandas/core/groupby/generic.py | 10 +- pandas/core/groupby/groupby.py | 3 +- pandas/core/groupby/ops.py | 11 ++- pandas/core/internals/array_manager.py | 3 + pandas/core/internals/base.py | 5 +- pandas/core/internals/construction.py | 15 +-- pandas/core/internals/managers.py | 4 + pandas/core/reshape/concat.py | 2 +- pandas/core/reshape/merge.py | 4 +- pandas/core/series.py | 16 ++- 14 files changed, 199 insertions(+), 62 deletions(-) diff --git a/pandas/_libs/properties.pyx b/pandas/_libs/properties.pyx index 3354290a5f535..e6cbfb2c11017 100644 --- a/pandas/_libs/properties.pyx +++ b/pandas/_libs/properties.pyx @@ -61,9 +61,10 @@ cdef class AxisProperty: if obj is None: # Only instances have _mgr, not classes return self + if self.axis == 0: + return obj._index else: - axes = obj._mgr.axes - return axes[self.axis] + return obj._columns def __set__(self, obj, value): obj._set_axis(self.axis, value) diff --git a/pandas/core/arraylike.py b/pandas/core/arraylike.py index 4e8e4ea7e8d87..996bfd776f981 100644 --- a/pandas/core/arraylike.py +++ b/pandas/core/arraylike.py @@ -358,6 +358,12 @@ def _reconstruct(result): return result if isinstance(result, BlockManager): # we went through BlockManager.apply e.g. np.sqrt + # TODO: any cases that aren't index/columns-preserving? 
+ if self.ndim == 1: + reconstruct_kwargs["index"] = self.index + else: + reconstruct_kwargs["index"] = self.index + reconstruct_kwargs["columns"] = self.columns result = self._constructor(result, **reconstruct_kwargs, copy=False) else: # we converted an array, lost our axes diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 22ccd1d763769..d8074ca18a8eb 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -590,7 +590,7 @@ class DataFrame(NDFrame, OpsMixin): 2 2 3 """ - _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set + _internal_names_set = {"_columns", "columns", "_index", "index"} | NDFrame._internal_names_set _typ = "dataframe" _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray) _accessors: set[str] = {"sparse"} @@ -621,11 +621,20 @@ def __init__( dtype = self._validate_dtype(dtype) if isinstance(data, DataFrame): + if index is None and columns is None: + index = data.index + columns = data.columns data = data._mgr if isinstance(data, (BlockManager, ArrayManager)): # first check if a Manager is passed without any other arguments # -> use fastpath (without checking Manager type) + if index is None or columns is None: + assert False + if not index.equals(data.axes[-1]):#index is not data.axes[-1]: + assert False + if not columns.equals(data.axes[0]):#columns is not data.axes[0]: + assert False if index is None and columns is None and dtype is None and not copy: # GH#33357 fastpath NDFrame.__init__(self, data) @@ -751,7 +760,7 @@ def __init__( index, # type: ignore[arg-type] dtype, ) - mgr = arrays_to_mgr( + mgr, _, _ = arrays_to_mgr( arrays, columns, index, @@ -794,7 +803,7 @@ def __init__( construct_1d_arraylike_from_scalar(data, len(index), dtype) for _ in range(len(columns)) ] - mgr = arrays_to_mgr(values, columns, index, dtype=None, typ=manager) + mgr, _, _ = arrays_to_mgr(values, columns, index, dtype=None, typ=manager) else: arr2d = construct_2d_arraylike_from_scalar( data, @@ -2399,9 +2408,10 @@ def 
maybe_reorder( columns = columns.drop(exclude) manager = get_option("mode.data_manager") - mgr = arrays_to_mgr(arrays, columns, result_index, typ=manager) + mgr, index, columns = arrays_to_mgr(arrays, columns, result_index, typ=manager) - return cls(mgr) + # FIXME: get axes without mgr.axes + return cls(mgr, index=index, columns=columns) def to_records( self, index: bool = True, column_dtypes=None, index_dtypes=None @@ -2603,7 +2613,7 @@ def _from_arrays( columns = ensure_index(columns) if len(columns) != len(arrays): raise ValueError("len(columns) must match len(arrays)") - mgr = arrays_to_mgr( + mgr, index, columns = arrays_to_mgr( arrays, columns, index, @@ -2611,7 +2621,7 @@ def _from_arrays( verify_integrity=verify_integrity, typ=manager, ) - return cls(mgr) + return cls(mgr, index=index, columns=columns) @doc( storage_options=_shared_docs["storage_options"], @@ -3729,7 +3739,7 @@ def _ixs(self, i: int, axis: int = 0) -> Series: # if we are a copy, mark as such copy = isinstance(new_mgr.array, np.ndarray) and new_mgr.array.base is None - result = self._constructor_sliced(new_mgr, name=self.index[i]).__finalize__( + result = self._constructor_sliced(new_mgr, index=self.columns, name=self.index[i]).__finalize__( self ) result._set_is_copy(self, copy=copy) @@ -4267,7 +4277,7 @@ def _box_col_values(self, values: SingleDataManager, loc: int) -> Series: name = self.columns[loc] klass = self._constructor_sliced # We get index=self.index bc values is a SingleDataManager - return klass(values, name=name, fastpath=True).__finalize__(self) + return klass(values, name=name, index=self.index, fastpath=True).__finalize__(self) # ---------------------------------------------------------------------- # Lookup Caching @@ -6942,8 +6952,12 @@ def sort_values( # type: ignore[override] new_data.set_axis( self._get_block_manager_axis(axis), default_index(len(indexer)) ) - - result = self._constructor(new_data) + # FIXME: get axes without mgr.axes + axes_dict = {} + 
axes_dict["index"] = new_data.axes[-1] + if self.ndim == 2: + axes_dict["columns"] = new_data.axes[0] + result = self._constructor(new_data, **axes_dict) if inplace: return self._update_inplace(result) else: @@ -7627,7 +7641,7 @@ def _dispatch_frame_op(self, right, func: Callable, axis: int | None = None): # i.e. scalar, faster than checking np.ndim(right) == 0 with np.errstate(all="ignore"): bm = self._mgr.apply(array_op, right=right) - return self._constructor(bm) + return self._constructor(bm, index=self.index, columns=self.columns) elif isinstance(right, DataFrame): assert self.index.equals(right.index) @@ -7648,7 +7662,7 @@ def _dispatch_frame_op(self, right, func: Callable, axis: int | None = None): right._mgr, # type: ignore[arg-type] array_op, ) - return self._constructor(bm) + return self._constructor(bm, index=self.index, columns=self.columns) elif isinstance(right, Series) and axis == 1: # axis=1 means we want to operate row-by-row @@ -10900,7 +10914,8 @@ def _get_data() -> DataFrame: # After possibly _get_data and transposing, we are now in the # simple case where we can use BlockManager.reduce res, _ = df._mgr.reduce(blk_func, ignore_failures=ignore_failures) - out = df._constructor(res).iloc[0] + # FIXME: get axes without mgr.axes + out = df._constructor(res, index=res.axes[1], columns=res.axes[0]).iloc[0] if out_dtype is not None: out = out.astype(out_dtype) if axis == 0 and len(self) == 0 and name in ["sum", "prod"]: @@ -11665,9 +11680,9 @@ def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame: _info_axis_name = "columns" index = properties.AxisProperty( - axis=1, doc="The index (row labels) of the DataFrame." + axis=0, doc="The index (row labels) of the DataFrame." 
) - columns = properties.AxisProperty(axis=0, doc="The column labels of the DataFrame.") + columns = properties.AxisProperty(axis=1, doc="The column labels of the DataFrame.") @property def _AXIS_NUMBERS(self) -> dict[str, int]: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index aa9845a2abb78..f92c297c77293 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -276,6 +276,13 @@ def __init__( object.__setattr__(self, "_attrs", attrs) object.__setattr__(self, "_flags", Flags(self, allows_duplicate_labels=True)) + # FIXME: get axes without data.axes + if self.ndim == 1: + object.__setattr__(self, "_index", data.axes[0]) + else: + object.__setattr__(self, "_index", data.axes[1]) + object.__setattr__(self, "_columns", data.axes[0]) + @classmethod def _init_mgr( cls, @@ -820,8 +827,29 @@ def _set_axis_nocheck(self, labels, axis: Axis, inplace: bool_t, copy: bool_t): def _set_axis(self, axis: int, labels: AnyArrayLike | list) -> None: labels = ensure_index(labels) - self._mgr.set_axis(axis, labels) + self._validate_set_axis(axis, labels) self._clear_item_cache() + if axis == 0: + object.__setattr__(self, "_index", labels) + else: + object.__setattr__(self, "_columns", labels) + + @final + def _validate_set_axis(self, axis: int, new_labels: Index) -> None: + # Caller is responsible for ensuring we have an Index object. + old_len = self.shape[axis] + new_len = len(new_labels) + + if axis == 1 and len(self.columns) == 0: + # If we are setting the index on a DataFrame with no columns, + # it is OK to change the length. 
+ pass + + elif new_len != old_len: + raise ValueError( + f"Length mismatch: Expected axis has {old_len} elements, new " + f"values have {new_len} elements" + ) @final def swapaxes( @@ -1495,7 +1523,8 @@ def blk_func(values: ArrayLike): return operator.neg(values) # type: ignore[arg-type] new_data = self._mgr.apply(blk_func) - res = self._constructor(new_data) + axes_dict = self._construct_axes_dict() + res = self._constructor(new_data, **axes_dict) return res.__finalize__(self, method="__neg__") @final @@ -1510,7 +1539,8 @@ def blk_func(values: ArrayLike): return operator.pos(values) # type: ignore[arg-type] new_data = self._mgr.apply(blk_func) - res = self._constructor(new_data) + axes_dict = self._construct_axes_dict() + res = self._constructor(new_data, **axes_dict) return res.__finalize__(self, method="__pos__") @final @@ -1520,7 +1550,8 @@ def __invert__(self: NDFrameT) -> NDFrameT: return self new_data = self._mgr.apply(operator.invert) - return self._constructor(new_data).__finalize__(self, method="__invert__") + axes_dict = self._construct_axes_dict() + return self._constructor(new_data, **axes_dict).__finalize__(self, method="__invert__") @final def __nonzero__(self) -> NoReturn: @@ -1647,7 +1678,8 @@ def abs(self: NDFrameT) -> NDFrameT: 3 7 40 -50 """ res_mgr = self._mgr.apply(np.abs) - return self._constructor(res_mgr).__finalize__(self, name="abs") + axes_dict = self._construct_axes_dict() + return self._constructor(res_mgr, **axes_dict).__finalize__(self, name="abs") @final def __abs__(self: NDFrameT) -> NDFrameT: @@ -3891,7 +3923,10 @@ def _take( verify=True, convert_indices=convert_indices, ) - return self._constructor(new_data).__finalize__(self, method="take") + axes_dict = self._construct_axes_dict() + #axes_dict[axis] = self.axes[axis].take(indices) # FIXME: get axes without mgr.axes + axes_dict[self._get_axis_name(axis)] = new_data.axes[self._get_block_manager_axis(axis)] + return self._constructor(new_data, **axes_dict).__finalize__(self, 
method="take") def _take_with_is_copy(self: NDFrameT, indices, axis=0) -> NDFrameT: """ @@ -4103,8 +4138,16 @@ def _slice(self: NDFrameT, slobj: slice, axis=0) -> NDFrameT: Slicing with this method is *always* positional. """ assert isinstance(slobj, slice), type(slobj) - axis = self._get_block_manager_axis(axis) - result = self._constructor(self._mgr.get_slice(slobj, axis=axis)) + + axis_name = self._get_axis_name(axis) + new_idx = self.axes[axis][slobj] + axes_dict = self._construct_axes_dict() + axes_dict[axis_name] = new_idx + + bm_axis = self._get_block_manager_axis(axis) + new_mgr = self._mgr.get_slice(slobj, axis=bm_axis) + + result = self._constructor(new_mgr, **axes_dict) result = result.__finalize__(self) # this could be a view @@ -4595,7 +4638,12 @@ def _drop_axis( allow_dups=True, only_slice=only_slice, ) - result = self._constructor(new_mgr) + # FIXME: get axes without mgr.axes + axes_dict = {} + axes_dict["index"] = new_mgr.axes[-1] + if self.ndim == 2: + axes_dict["columns"] = new_mgr.axes[0] + result = self._constructor(new_mgr, **axes_dict) if self.ndim == 1: result.name = self.name @@ -5056,7 +5104,12 @@ def sort_index( axis = 1 if isinstance(self, ABCDataFrame) else 0 new_data.set_axis(axis, default_index(len(indexer))) - result = self._constructor(new_data) + axes_dict = {}#self._construct_axes_dict() + # FIXME: get axes without mgr.axes + axes_dict["index"] = new_data.axes[-1] + if self.ndim == 2: + axes_dict["columns"] = new_data.axes[0] + result = self._constructor(new_data, **axes_dict) if inplace: return self._update_inplace(result) @@ -5393,7 +5446,13 @@ def _reindex_with_indexers( if copy and new_data is self._mgr: new_data = new_data.copy() - return self._constructor(new_data).__finalize__(self) + # FIXME: get axes without mgr.axes + if self.ndim == 1: + axes_dict = {"index": new_data.axes[0]} + else: + axes_dict = {"index": new_data.axes[1], "columns": new_data.axes[0]} + + return self._constructor(new_data, 
**axes_dict).__finalize__(self) def filter( self: NDFrameT, @@ -6018,7 +6077,8 @@ def _consolidate(self): """ f = lambda: self._mgr.consolidate() cons_data = self._protect_consolidate(f) - return self._constructor(cons_data).__finalize__(self) + axes_dict = self._construct_axes_dict() + return self._constructor(cons_data, **axes_dict).__finalize__(self) @property def _is_mixed_type(self) -> bool_t: @@ -6050,11 +6110,23 @@ def _check_inplace_setting(self, value) -> bool_t: @final def _get_numeric_data(self: NDFrameT) -> NDFrameT: - return self._constructor(self._mgr.get_numeric_data()).__finalize__(self) + # FIXME: get axes without mgr.axes + mgr = self._mgr.get_numeric_data() + axes_dict = {} + axes_dict["index"] = mgr.axes[-1] + if self.ndim == 2: + axes_dict["columns"] = mgr.axes[0] + return self._constructor(mgr, **axes_dict).__finalize__(self) @final def _get_bool_data(self): - return self._constructor(self._mgr.get_bool_data()).__finalize__(self) + # FIXME: get axes without mgr.axes + mgr = self._mgr.get_bool_data() + axes_dict = {} + axes_dict["index"] = mgr.axes[-1] + if self.ndim == 2: + axes_dict["columns"] = mgr.axes[0] + return self._constructor(mgr, **axes_dict).__finalize__(self) # ---------------------------------------------------------------------- # Internal Interface Methods @@ -6264,7 +6336,8 @@ def astype( else: # else, only a single dtype is given new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors) - return self._constructor(new_data).__finalize__(self, method="astype") + axes_dict = self._construct_axes_dict() + return self._constructor(new_data, **axes_dict).__finalize__(self, method="astype") # GH 33113: handle empty frame or series if not results: @@ -6393,7 +6466,8 @@ def copy(self: NDFrameT, deep: bool_t | None = True) -> NDFrameT: """ data = self._mgr.copy(deep=deep) self._clear_item_cache() - return self._constructor(data).__finalize__(self, method="copy") + axes_dict = self._construct_axes_dict() + return 
self._constructor(data, **axes_dict).__finalize__(self, method="copy") @final def __copy__(self: NDFrameT, deep: bool_t = True) -> NDFrameT: @@ -6436,13 +6510,15 @@ def _convert( validate_bool_kwarg(datetime, "datetime") validate_bool_kwarg(numeric, "numeric") validate_bool_kwarg(timedelta, "timedelta") + axes_dict = self._construct_axes_dict() return self._constructor( self._mgr.convert( datetime=datetime, numeric=numeric, timedelta=timedelta, copy=True, - ) + ), + **axes_dict ).__finalize__(self) @final @@ -6954,7 +7030,8 @@ def fillna( else: raise ValueError(f"invalid fill value with a {type(value)}") - result = self._constructor(new_data) + axes_dict = self._construct_axes_dict() + result = self._constructor(new_data, **axes_dict) if inplace: return self._update_inplace(result) else: @@ -9615,7 +9692,12 @@ def _align_series( if copy and fdata is self._mgr: fdata = fdata.copy() - left = self._constructor(fdata) + # FIXME: get axes without mgr.axes + if self.ndim == 1: + axes_dict = {"index": fdata.axes[0]} + else: + axes_dict = {"index": fdata.axes[1], "columns": fdata.axes[0]} + left = self._constructor(fdata, **axes_dict) if ridx is None: right = other @@ -9757,7 +9839,8 @@ def _where( self._check_inplace_setting(other) new_data = self._mgr.putmask(mask=cond, new=other, align=align) - result = self._constructor(new_data) + axes_dict = self._construct_axes_dict() + result = self._constructor(new_data, **axes_dict) return self._update_inplace(result) else: @@ -9766,7 +9849,8 @@ def _where( cond=cond, align=align, ) - result = self._constructor(new_data) + axes_dict = self._construct_axes_dict() + result = self._constructor(new_data, **axes_dict) return result.__finalize__(self) @overload @@ -11245,8 +11329,8 @@ def block_accum_func(blk_values): return result result = self._mgr.apply(block_accum_func) - - return self._constructor(result).__finalize__(self, method=name) + axes_dict = self._construct_axes_dict() + return self._constructor(result, 
**axes_dict).__finalize__(self, method=name) def cummax(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs): return self._accum_func( diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 33f3ffa34489e..800eb94d574e0 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -175,7 +175,9 @@ def _wrap_agged_manager(self, mgr: Manager) -> Series: else: mgr = cast(Manager2D, mgr) single = mgr.iget(0) - ser = self.obj._constructor(single, name=self.obj.name) + # FIXME: get axes without mgr.axes + index = single.axes[0] + ser = self.obj._constructor(single, index=index, name=self.obj.name) # NB: caller is responsible for setting ser.index return ser @@ -1654,14 +1656,16 @@ def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame: rows = mgr.shape[1] if mgr.shape[0] > 0 else 0 index = Index(range(rows)) mgr.set_axis(1, index) - result = self.obj._constructor(mgr) + # FIXME: get axes without mgr.axes + result = self.obj._constructor(mgr, index=mgr.axes[1], columns=mgr.axes[0]) self._insert_inaxis_grouper_inplace(result) result = result._consolidate() else: index = self.grouper.result_index mgr.set_axis(1, index) - result = self.obj._constructor(mgr) + # FIXME: get axes without mgr.axes + result = self.obj._constructor(mgr, index=mgr.axes[1], columns=mgr.axes[0]) if self.axis == 1: result = result.T diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b963b85b93a31..63fce060ab887 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3864,7 +3864,8 @@ def blk_func(values: ArrayLike) -> ArrayLike: if is_ser: out = self._wrap_agged_manager(res_mgr) else: - out = obj._constructor(res_mgr) + # FIXME: get axes without mgr.axes + out = obj._constructor(res_mgr, index=res_mgr.axes[1], columns=res_mgr.axes[0]) return self._wrap_aggregated_output(out) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 
ba808e1f2e07f..46440f59adcfc 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -1348,7 +1348,8 @@ class SeriesSplitter(DataSplitter): def _chop(self, sdata: Series, slice_obj: slice) -> Series: # fastpath equivalent to `sdata.iloc[slice_obj]` mgr = sdata._mgr.get_slice(slice_obj) - ser = sdata._constructor(mgr, name=sdata.name, fastpath=True) + index = sdata.index[slice_obj] + ser = sdata._constructor(mgr, index=index, name=sdata.name, fastpath=True) return ser.__finalize__(sdata, method="groupby") @@ -1360,7 +1361,13 @@ def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: # else: # return sdata.iloc[:, slice_obj] mgr = sdata._mgr.get_slice(slice_obj, axis=1 - self.axis) - df = sdata._constructor(mgr) + if self.axis == 0: + index = sdata.index[slice_obj] + columns = sdata.columns + else: + index = sdata.index + columns = sdata.columns[slice_obj] + df = sdata._constructor(mgr, index=index, columns=columns) return df.__finalize__(sdata, method="groupby") diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 53f8486074ef9..0fda74149fdf4 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -1180,6 +1180,9 @@ def as_array( return result + def __len__(self) -> int: + return len(self.arrays) + class SingleArrayManager(BaseArrayManager, SingleDataManager): diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py index ddc4495318568..c695f2c1e6ff1 100644 --- a/pandas/core/internals/base.py +++ b/pandas/core/internals/base.py @@ -45,7 +45,7 @@ def items(self) -> Index: @final def __len__(self) -> int: - return len(self.items) + raise AbstractMethodError(self) @property def ndim(self) -> int: @@ -160,6 +160,9 @@ class SingleDataManager(DataManager): def ndim(self) -> Literal[1]: return 1 + def __len__(self) -> int: + return len(self.arrays[0]) + @final @property def array(self) -> ArrayLike: diff --git 
a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 6aad8dbd940d4..708adc6201fe1 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -107,7 +107,7 @@ def arrays_to_mgr( verify_integrity: bool = True, typ: str | None = None, consolidate: bool = True, -) -> Manager: +) -> tuple[Manager, Index, Index]: """ Segregate Series based on type and coerce into matrices. @@ -152,13 +152,14 @@ def arrays_to_mgr( axes = [columns, index] if typ == "block": - return create_block_manager_from_column_arrays( + mgr = create_block_manager_from_column_arrays( arrays, axes, consolidate=consolidate ) elif typ == "array": - return ArrayManager(arrays, [index, columns]) + mgr = ArrayManager(arrays, [index, columns]) else: raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'") + return mgr, index, columns def rec_array_to_mgr( @@ -204,7 +205,7 @@ def rec_array_to_mgr( if columns is None: columns = arr_columns - mgr = arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ) + mgr = arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ)[0] if copy: mgr = mgr.copy() @@ -242,7 +243,7 @@ def mgr_to_mgr(mgr, typ: str, copy: bool = True): new_mgr = mgr else: if mgr.ndim == 2: - new_mgr = arrays_to_mgr( + new_mgr, _, _ = arrays_to_mgr( mgr.arrays, mgr.axes[0], mgr.axes[1], typ="block" ) else: @@ -314,7 +315,7 @@ def ndarray_to_mgr( else: columns = ensure_index(columns) - return arrays_to_mgr(values, columns, index, dtype=dtype, typ=typ) + return arrays_to_mgr(values, columns, index, dtype=dtype, typ=typ)[0] elif is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype): # i.e. Datetime64TZ, PeriodDtype @@ -491,7 +492,7 @@ def dict_to_mgr( # dtype check to exclude e.g. 
range objects, scalars arrays = [x.copy() if hasattr(x, "dtype") else x for x in arrays] - return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy) + return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)[0] def nested_data_to_arrays( diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 61037a46f4f92..f1fd5aed7dcf4 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1856,6 +1856,10 @@ def _consolidate_inplace(self) -> None: self._known_consolidated = True self._rebuild_blknos_and_blklocs() + def __len__(self) -> int: + # TODO: cache? would need to invalidate akin to blklocs + return sum(x.shape[1] for x in self.blocks) + class SingleBlockManager(BaseBlockManager, SingleDataManager): """manage a single block with""" diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 3d9e4f0c69c62..ac60aaf591c27 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -621,7 +621,7 @@ def get_result(self): new_data._consolidate_inplace() cons = sample._constructor - return cons(new_data).__finalize__(self, method="concat") + return cons(new_data, index=self.new_axes[1], columns=self.new_axes[0]).__finalize__(self, method="concat") def _get_result_dim(self) -> int: if self._is_series and self.bm_axis == 1: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 524b26ff07769..3aae45e19bddf 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -746,7 +746,7 @@ def _reindex_and_concat( allow_dups=True, use_na_proxy=True, ) - left = left._constructor(lmgr) + left = left._constructor(lmgr, index=join_index, columns=left.columns) left.index = join_index if right_indexer is not None: @@ -759,7 +759,7 @@ def _reindex_and_concat( allow_dups=True, use_na_proxy=True, ) - right = right._constructor(rmgr) + right = right._constructor(rmgr, index=join_index, 
columns=right.columns) right.index = join_index from pandas import concat diff --git a/pandas/core/series.py b/pandas/core/series.py index fc97a8f04e0cc..022004fa469c6 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -318,7 +318,7 @@ class Series(base.IndexOpsMixin, NDFrame): _name: Hashable _metadata: list[str] = ["name"] - _internal_names_set = {"index"} | NDFrame._internal_names_set + _internal_names_set = {"_index", "index"} | NDFrame._internal_names_set _accessors = {"dt", "cat", "str", "sparse"} _hidden_attrs = ( base.IndexOpsMixin._hidden_attrs @@ -351,6 +351,12 @@ def __init__( fastpath: bool = False, ) -> None: + if isinstance(data, (SingleBlockManager, SingleArrayManager)): + if index is None: + assert False + if not index.equals(data.axes[0]):#index is not data.axes[0]: + assert False + if ( isinstance(data, (SingleBlockManager, SingleArrayManager)) and index is None @@ -592,7 +598,8 @@ def _set_axis(self, axis: int, labels: AnyArrayLike | list) -> None: pass # The ensure_index call above ensures we have an Index object - self._mgr.set_axis(axis, labels) + self._validate_set_axis(0, labels) + object.__setattr__(self, "_index", labels) # ndarray compatibility @property @@ -1071,7 +1078,8 @@ def _get_values_tuple(self, key: tuple): def _get_values(self, indexer: slice | npt.NDArray[np.bool_]) -> Series: new_mgr = self._mgr.getitem_mgr(indexer) - return self._constructor(new_mgr).__finalize__(self) + new_index = self.index[indexer] + return self._constructor(new_mgr, index=new_index).__finalize__(self) def _get_value(self, label, takeable: bool = False): """ @@ -1946,7 +1954,7 @@ def to_frame(self, name: Hashable = lib.no_default) -> DataFrame: columns = Index([name]) mgr = self._mgr.to_2d_mgr(columns) - df = self._constructor_expanddim(mgr) + df = self._constructor_expanddim(mgr, index=self.index, columns=columns) return df.__finalize__(self, method="to_frame") def _set_name(self, name, inplace=False) -> Series: From 
87eafb4c783ffcd5ba9f148f182c725586384834 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 17 Aug 2022 14:35:56 -0700 Subject: [PATCH 2/4] fix some tests --- pandas/core/apply.py | 3 +- pandas/core/arraylike.py | 6 +--- pandas/core/frame.py | 63 ++++++++++++++++++++++------------ pandas/core/generic.py | 58 +++++++++++++++---------------- pandas/core/groupby/generic.py | 9 ++--- pandas/core/groupby/groupby.py | 3 +- pandas/core/indexing.py | 14 ++++++++ pandas/core/series.py | 7 ++-- 8 files changed, 98 insertions(+), 65 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 48822d9d01ddb..372d932629259 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -726,7 +726,8 @@ def apply(self) -> DataFrame | Series: with np.errstate(all="ignore"): results = self.obj._mgr.apply("apply", func=self.f) # _constructor will retain self.index and self.columns - return self.obj._constructor(data=results) + axes_dict = self.obj._construct_axes_dict() + return self.obj._constructor(data=results, **axes_dict) # broadcasting if self.result_type == "broadcast": diff --git a/pandas/core/arraylike.py b/pandas/core/arraylike.py index 996bfd776f981..871eb187802f0 100644 --- a/pandas/core/arraylike.py +++ b/pandas/core/arraylike.py @@ -359,11 +359,7 @@ def _reconstruct(result): if isinstance(result, BlockManager): # we went through BlockManager.apply e.g. np.sqrt # TODO: any cases that aren't index/columns-preserving? 
- if self.ndim == 1: - reconstruct_kwargs["index"] = self.index - else: - reconstruct_kwargs["index"] = self.index - reconstruct_kwargs["columns"] = self.columns + reconstruct_kwargs.update(self._construct_axes_dict()) result = self._constructor(result, **reconstruct_kwargs, copy=False) else: # we converted an array, lost our axes diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d8074ca18a8eb..b3c45930ae71f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -621,8 +621,9 @@ def __init__( dtype = self._validate_dtype(dtype) if isinstance(data, DataFrame): - if index is None and columns is None: + if index is None: index = data.index + if columns is None: columns = data.columns data = data._mgr @@ -631,10 +632,14 @@ def __init__( # -> use fastpath (without checking Manager type) if index is None or columns is None: assert False - if not index.equals(data.axes[-1]):#index is not data.axes[-1]: - assert False - if not columns.equals(data.axes[0]):#columns is not data.axes[0]: - assert False + if data.axes[0] is not columns or data.axes[1] is not index: + # FIXME: without this check, json tests segfault... 
+ # nope, segfaults even with this check + data.axes = [ensure_index(columns), ensure_index(index)] + #if not index.equals(data.axes[-1]):#index is not data.axes[-1]: + # assert False + #if not columns.equals(data.axes[0]):#columns is not data.axes[0]: + # assert False if index is None and columns is None and dtype is None and not copy: # GH#33357 fastpath NDFrame.__init__(self, data) @@ -2410,7 +2415,6 @@ def maybe_reorder( manager = get_option("mode.data_manager") mgr, index, columns = arrays_to_mgr(arrays, columns, result_index, typ=manager) - # FIXME: get axes without mgr.axes return cls(mgr, index=index, columns=columns) def to_records( @@ -4164,6 +4168,7 @@ def _set_item_mgr(self, key, value: ArrayLike) -> None: except KeyError: # This item wasn't present, just insert at end self._mgr.insert(len(self._info_axis), key, value) + self._columns = self.columns.insert(len(self._info_axis), key) else: self._iset_item_mgr(loc, value) @@ -4765,7 +4770,9 @@ def predicate(arr: ArrayLike) -> bool: return True mgr = self._mgr._get_data_subset(predicate).copy(deep=None) - return type(self)(mgr).__finalize__(self) + # FIXME: get axes without mgr.axes + assert mgr.axes[1] is self.index # WTF why does passing columns/index cause segfault? 
+ return type(self)(mgr, columns=mgr.axes[0], index=mgr.axes[1]).__finalize__(self) def insert( self, @@ -5865,7 +5872,7 @@ def shift( fill_value=fill_value, allow_dups=True, ) - res_df = self._constructor(mgr) + res_df = self._constructor(mgr, columns=self.columns, index=self.index) return res_df.__finalize__(self, method="shift") return super().shift( @@ -6392,7 +6399,8 @@ class max type @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) def isna(self) -> DataFrame: - result = self._constructor(self._mgr.isna(func=isna)) + axes_dict = self._construct_axes_dict() + result = self._constructor(self._mgr.isna(func=isna), **axes_dict) return result.__finalize__(self, method="isna") @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) @@ -6944,19 +6952,26 @@ def sort_values( # type: ignore[override] else: return self.copy() + bm_axis = self._get_block_manager_axis(axis) + new_data = self._mgr.take( - indexer, axis=self._get_block_manager_axis(axis), verify=False + indexer, axis=bm_axis, verify=False ) - if ignore_index: - new_data.set_axis( - self._get_block_manager_axis(axis), default_index(len(indexer)) - ) - # FIXME: get axes without mgr.axes + axis_name = self._get_axis_name(axis) + axes_dict = {} - axes_dict["index"] = new_data.axes[-1] - if self.ndim == 2: - axes_dict["columns"] = new_data.axes[0] + axes_dict[axis_name] = self.axes[axis].take(indexer) + if axis == 0: + axes_dict["columns"] = self.columns + else: + axes_dict["index"] = self.index + + if ignore_index: + rng = default_index(len(indexer)) + new_data.set_axis(bm_axis, rng) + axes_dict[axis_name] = rng + result = self._constructor(new_data, **axes_dict) if inplace: return self._update_inplace(result) @@ -10913,9 +10928,12 @@ def _get_data() -> DataFrame: # After possibly _get_data and transposing, we are now in the # simple case where we can use BlockManager.reduce - res, _ = df._mgr.reduce(blk_func, ignore_failures=ignore_failures) - # FIXME: get axes without mgr.axes - out = df._constructor(res, 
index=res.axes[1], columns=res.axes[0]).iloc[0] + res, indexer = df._mgr.reduce(blk_func, ignore_failures=ignore_failures) + index = Index([None], dtype=object) + assert index.equals(res.axes[1]) + columns = self.columns.take(indexer) + assert columns.equals(res.axes[0]) + out = df._constructor(res, index=index, columns=columns).iloc[0] if out_dtype is not None: out = out.astype(out_dtype) if axis == 0 and len(self) == 0 and name in ["sum", "prod"]: @@ -11413,7 +11431,8 @@ def quantile( res = data._mgr.take(indexer[q_idx], verify=False) res.axes[1] = q - result = self._constructor(res) + # FIXME: get axes without mgr.axes + result = self._constructor(res, columns=res.axes[0], index=res.axes[1]) return result.__finalize__(self, method="quantile") @doc(NDFrame.asfreq, **_shared_doc_kwargs) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f92c297c77293..d84013e35d2d8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3924,7 +3924,7 @@ def _take( convert_indices=convert_indices, ) axes_dict = self._construct_axes_dict() - #axes_dict[axis] = self.axes[axis].take(indices) # FIXME: get axes without mgr.axes + # FIXME: get axes without mgr.axes axes_dict[self._get_axis_name(axis)] = new_data.axes[self._get_block_manager_axis(axis)] return self._constructor(new_data, **axes_dict).__finalize__(self, method="take") @@ -4113,7 +4113,7 @@ class animal locomotion new_mgr = self._mgr.fast_xs(loc) result = self._constructor_sliced( - new_mgr, name=self.index[loc] + new_mgr, name=self.index[loc], index=self.columns ).__finalize__(self) elif is_scalar(loc): result = self.iloc[:, slice(loc, loc + 1)] @@ -4156,6 +4156,14 @@ def _slice(self: NDFrameT, slobj: slice, axis=0) -> NDFrameT: result._set_is_copy(self, copy=is_copy) return result + @staticmethod + def _get_axes_from_mgr(mgr): + axes_dict = {} + axes_dict["index"] = mgr.axes[-1] + if mgr.ndim == 2: + axes_dict["columns"] = mgr.axes[0] + return axes_dict + @final def _set_is_copy(self, ref: 
NDFrame, copy: bool_t = True) -> None: if not copy: @@ -4291,6 +4299,11 @@ def __delitem__(self, key) -> None: # exception: loc = self.axes[-1].get_loc(key) self._mgr = self._mgr.idelete(loc) + # FIXME: get axes without mgr.axes + if self.ndim == 1: + self._index = self._mgr.axes[0] + else: + self._columns = self._mgr.axes[0] # delete from the caches try: @@ -4639,10 +4652,7 @@ def _drop_axis( only_slice=only_slice, ) # FIXME: get axes without mgr.axes - axes_dict = {} - axes_dict["index"] = new_mgr.axes[-1] - if self.ndim == 2: - axes_dict["columns"] = new_mgr.axes[0] + axes_dict = self._get_axes_from_mgr(new_mgr) result = self._constructor(new_mgr, **axes_dict) if self.ndim == 1: result.name = self.name @@ -5104,11 +5114,8 @@ def sort_index( axis = 1 if isinstance(self, ABCDataFrame) else 0 new_data.set_axis(axis, default_index(len(indexer))) - axes_dict = {}#self._construct_axes_dict() # FIXME: get axes without mgr.axes - axes_dict["index"] = new_data.axes[-1] - if self.ndim == 2: - axes_dict["columns"] = new_data.axes[0] + axes_dict = self._get_axes_from_mgr(new_data) result = self._constructor(new_data, **axes_dict) if inplace: @@ -5447,10 +5454,7 @@ def _reindex_with_indexers( new_data = new_data.copy() # FIXME: get axes without mgr.axes - if self.ndim == 1: - axes_dict = {"index": new_data.axes[0]} - else: - axes_dict = {"index": new_data.axes[1], "columns": new_data.axes[0]} + axes_dict = self._get_axes_from_mgr(new_data) return self._constructor(new_data, **axes_dict).__finalize__(self) @@ -6110,22 +6114,16 @@ def _check_inplace_setting(self, value) -> bool_t: @final def _get_numeric_data(self: NDFrameT) -> NDFrameT: - # FIXME: get axes without mgr.axes mgr = self._mgr.get_numeric_data() - axes_dict = {} - axes_dict["index"] = mgr.axes[-1] - if self.ndim == 2: - axes_dict["columns"] = mgr.axes[0] + # FIXME: get axes without mgr.axes + axes_dict = self._get_axes_from_mgr(mgr) return self._constructor(mgr, **axes_dict).__finalize__(self) @final def 
_get_bool_data(self): - # FIXME: get axes without mgr.axes mgr = self._mgr.get_bool_data() - axes_dict = {} - axes_dict["index"] = mgr.axes[-1] - if self.ndim == 2: - axes_dict["columns"] = mgr.axes[0] + # FIXME: get axes without mgr.axes + axes_dict = self._get_axes_from_mgr(mgr) return self._constructor(mgr, **axes_dict).__finalize__(self) # ---------------------------------------------------------------------- @@ -6563,8 +6561,10 @@ def infer_objects(self: NDFrameT) -> NDFrameT: # numeric=False necessary to only soft convert; # python objects will still be converted to # native numpy numeric types + axes_dict = self._construct_axes_dict() return self._constructor( - self._mgr.convert(datetime=True, numeric=False, timedelta=True, copy=True) + self._mgr.convert(datetime=True, numeric=False, timedelta=True, copy=True), + **axes_dict, ).__finalize__(self, method="infer_objects") @final @@ -9693,10 +9693,7 @@ def _align_series( fdata = fdata.copy() # FIXME: get axes without mgr.axes - if self.ndim == 1: - axes_dict = {"index": fdata.axes[0]} - else: - axes_dict = {"index": fdata.axes[1], "columns": fdata.axes[0]} + axes_dict = self._get_axes_from_mgr(fdata) left = self._constructor(fdata, **axes_dict) if ridx is None: @@ -10282,7 +10279,8 @@ def shift( new_data = self._mgr.shift( periods=periods, axis=axis, fill_value=fill_value ) - return self._constructor(new_data).__finalize__(self, method="shift") + axes_dict = self._construct_axes_dict() + return self._constructor(new_data, **axes_dict).__finalize__(self, method="shift") # when freq is given, index is shifted, data is not index = self._get_axis(axis) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 800eb94d574e0..6664a91eda093 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1160,7 +1160,7 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: for i, (item, sgb) in enumerate(self._iterate_column_groupbys(obj)): result[i] = 
sgb.aggregate(func, *args, **kwargs) - res_df = self.obj._constructor(result) + res_df = self.obj._constructor(result, columns=obj.columns) res_df.columns = obj.columns return res_df @@ -1335,7 +1335,8 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike: if len(res_mgr) < orig_mgr_len: warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only) - res_df = self.obj._constructor(res_mgr) + # FIXME: get axes without mgr.axes + res_df = self.obj._constructor(res_mgr, index=res_mgr.axes[1], columns=res_mgr.axes[0]) if self.axis == 1: res_df = res_df.T return res_df @@ -1657,7 +1658,7 @@ def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame: index = Index(range(rows)) mgr.set_axis(1, index) # FIXME: get axes without mgr.axes - result = self.obj._constructor(mgr, index=mgr.axes[1], columns=mgr.axes[0]) + result = self.obj._constructor(mgr, index=index, columns=mgr.axes[0]) self._insert_inaxis_grouper_inplace(result) result = result._consolidate() @@ -1665,7 +1666,7 @@ def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame: index = self.grouper.result_index mgr.set_axis(1, index) # FIXME: get axes without mgr.axes - result = self.obj._constructor(mgr, index=mgr.axes[1], columns=mgr.axes[0]) + result = self.obj._constructor(mgr, index=index, columns=mgr.axes[0]) if self.axis == 1: result = result.T diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 63fce060ab887..4765f0e6bd8c2 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2906,7 +2906,8 @@ def blk_func(values: ArrayLike) -> ArrayLike: mgr = obj._mgr res_mgr = mgr.apply(blk_func) - new_obj = obj._constructor(res_mgr) + axes_dict = obj._construct_axes_dict() + new_obj = obj._constructor(res_mgr, **axes_dict) if isinstance(new_obj, Series): new_obj.name = obj.name diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index d415cbd035cd1..ed7307debbd19 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -803,6 
+803,9 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None) -> None: keys = self.obj.columns.union(key, sort=False) self.obj._mgr = self.obj._mgr.reindex_axis(keys, axis=0, only_slice=True) + assert self.obj._mgr.axes[0].equals(keys) + self.obj._columns = Index(keys) + @final def __setitem__(self, key, value) -> None: @@ -1765,8 +1768,13 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"): reindexers, allow_dups=True ) self.obj._mgr = new_obj._mgr + self.obj._index = self.obj._mgr.axes[-1] + if self.ndim == 2: + # FIXME: get axes without mgr.axes + self.obj._columns = self.obj._mgr.axes[0] self.obj._maybe_update_cacher(clear=True) self.obj._is_copy = None + nindexer.append(labels.get_loc(key)) @@ -1988,6 +1996,7 @@ def _setitem_single_column(self, loc: int, value, plane_indexer) -> None: # falling back to casting if necessary) self.obj._mgr.column_setitem(loc, plane_indexer, value) self.obj._clear_item_cache() + return # We will not operate in-place, but will attempt to in the future. @@ -2078,6 +2087,7 @@ def _setitem_single_block(self, indexer, value, name: str) -> None: # actually do the set self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value) self.obj._maybe_update_cacher(clear=True, inplace=True) + def _setitem_with_indexer_missing(self, indexer, value): """ @@ -2129,8 +2139,10 @@ def _setitem_with_indexer_missing(self, indexer, value): self.obj._mgr = self.obj._constructor( new_values, index=new_index, name=self.obj.name )._mgr + self.obj._index = new_index self.obj._maybe_update_cacher(clear=True) + elif self.ndim == 2: if not len(self.obj.columns): @@ -2172,8 +2184,10 @@ def _setitem_with_indexer_missing(self, indexer, value): # dtype. 
But if we had a list or dict, then do inference df = df.infer_objects() self.obj._mgr = df._mgr + else: self.obj._mgr = self.obj._append(value)._mgr + self.obj._maybe_update_cacher(clear=True) def _ensure_iterable_column_indexer(self, column_indexer): diff --git a/pandas/core/series.py b/pandas/core/series.py index 022004fa469c6..f1ed825dd4f2c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -354,8 +354,11 @@ def __init__( if isinstance(data, (SingleBlockManager, SingleArrayManager)): if index is None: assert False - if not index.equals(data.axes[0]):#index is not data.axes[0]: - assert False + if data.axes[0] is not index: + # Adding check to try to avoid segfault in json tests + data.axes = [ensure_index(index)] + #if not index.equals(data.axes[0]):#index is not data.axes[0]: + # assert False if ( isinstance(data, (SingleBlockManager, SingleArrayManager)) From 7f73a89570f9775546f6376e5ff5d31f4933a7c2 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 19 Aug 2022 10:35:20 -0700 Subject: [PATCH 3/4] down to about 100 failing tests --- pandas/core/apply.py | 4 +- pandas/core/frame.py | 48 ++++++--- pandas/core/generic.py | 95 +++++++++++++------ pandas/core/groupby/generic.py | 2 +- pandas/core/groupby/groupby.py | 13 ++- pandas/core/indexes/base.py | 2 + pandas/core/indexing.py | 20 +++-- pandas/core/internals/array_manager.py | 1 + pandas/core/internals/base.py | 4 +- pandas/core/internals/managers.py | 39 +++++---- pandas/core/resample.py | 2 +- pandas/core/reshape/reshape.py | 6 +- pandas/core/series.py | 1 + pandas/tests/internals/test_internals.py | 10 +-- pandas/tests/series/methods/test_reindex.py | 1 + 15 files changed, 169 insertions(+), 79 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 372d932629259..3faac6858e7af 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1002,6 +1002,7 @@ def series_generator(self): # We create one Series object, and will swap out the data inside # of it. 
Kids: don't do this at home. ser = self.obj._ixs(0, axis=0) + index = ser.index mgr = ser._mgr if is_extension_array_dtype(ser.dtype): @@ -1013,9 +1014,10 @@ def series_generator(self): else: for (arr, name) in zip(values, self.index): - # GH#35462 re-pin mgr in case setitem changed it + # GH#35462 re-pin mgr, index in case setitem changed it ser._mgr = mgr mgr.set_values(arr) + ser._index = index object.__setattr__(ser, "_name", name) yield ser diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b3c45930ae71f..76b01b069d2dd 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4272,6 +4272,7 @@ def _ensure_valid_index(self, value) -> None: index_copy.name = self.index.name self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan) + self._index = index_copy def _box_col_values(self, values: SingleDataManager, loc: int) -> Series: """ @@ -4501,6 +4502,8 @@ def query(self, expr: str, inplace: bool = False, **kwargs) -> DataFrame | None: if inplace: self._update_inplace(result) + self._index = result._index + self._columns = result._columns return None else: return result @@ -4757,8 +4760,7 @@ def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool: and not is_bool_dtype(dtype) ) - def predicate(arr: ArrayLike) -> bool: - dtype = arr.dtype + def predicate(dtype: DtypeObj) -> bool: if include: if not dtype_predicate(dtype, include): return False @@ -4769,10 +4771,16 @@ def predicate(arr: ArrayLike) -> bool: return True - mgr = self._mgr._get_data_subset(predicate).copy(deep=None) + def arr_predicate(arr: ArrayLike) -> bool: + dtype = arr.dtype + return predicate(dtype) + + mgr, taker = self._mgr._get_data_subset(arr_predicate).copy(deep=None) # FIXME: get axes without mgr.axes - assert mgr.axes[1] is self.index # WTF why does passing columns/index cause segfault? 
- return type(self)(mgr, columns=mgr.axes[0], index=mgr.axes[1]).__finalize__(self) + # FIXME: return taker from _get_data_subset, this is really slow + #taker = self.dtypes.apply(predicate).values.nonzero()[0] + columns = self.columns.take(taker) + return type(self)(mgr, columns=columns, index=self.index).__finalize__(self) def insert( self, @@ -4841,6 +4849,7 @@ def insert( value = self._sanitize_column(value) self._mgr.insert(loc, column, value) + self._columns = self.columns.insert(loc, column) def assign(self, **kwargs) -> DataFrame: r""" @@ -6605,6 +6614,8 @@ def dropna( if not inplace: return result self._update_inplace(result) + self._columns = result._columns + self._index = result._index return None @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "subset"]) @@ -6703,6 +6714,8 @@ def drop_duplicates( if inplace: self._update_inplace(result) + self._index = result._index + self._columns = result._columns return None else: return result @@ -9268,7 +9281,7 @@ def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame: axis = 0 new_data = self._mgr.diff(n=periods, axis=axis) - return self._constructor(new_data).__finalize__(self, "diff") + return self._constructor(new_data, index=self.index, columns=self.columns).__finalize__(self, "diff") # ---------------------------------------------------------------------- # Function application @@ -10879,8 +10892,9 @@ def _reduce( # cols = self.columns[~dt64_cols] # self = self[cols] predicate = lambda x: not is_datetime64_any_dtype(x.dtype) - mgr = self._mgr._get_data_subset(predicate) - self = type(self)(mgr) + mgr, taker = self._mgr._get_data_subset(predicate) + columns = self.columns[taker] + self = type(self)(mgr, index=self.index, columns=columns) # TODO: Make other agg func handle axis=None properly GH#21597 axis = self._get_axis_number(axis) @@ -10928,11 +10942,20 @@ def _get_data() -> DataFrame: # After possibly _get_data and transposing, we are now in the # simple case where we can use 
BlockManager.reduce - res, indexer = df._mgr.reduce(blk_func, ignore_failures=ignore_failures) + res, _ = df._mgr.reduce(blk_func, ignore_failures=ignore_failures) index = Index([None], dtype=object) assert index.equals(res.axes[1]) - columns = self.columns.take(indexer) - assert columns.equals(res.axes[0]) + if ignore_failures: + if len(res.items) == len(df.columns): + # i.e. nothing was dropped + columns = df.columns + else: + # FIXME: get axes without mgr.axes; THIS IS WRONG TOO + columns = res.axes[0] + else: + columns = df.columns + assert columns.equals(res.axes[0]) + out = df._constructor(res, index=index, columns=columns).iloc[0] if out_dtype is not None: out = out.astype(out_dtype) @@ -11736,8 +11759,9 @@ def _to_dict_of_blocks(self, copy: bool = True): # convert to BlockManager if needed -> this way support ArrayManager as well mgr = mgr_to_mgr(mgr, "block") mgr = cast(BlockManager, mgr) + # FIXME: get axes without mgr.axes return { - k: self._constructor(v).__finalize__(self) + k: self._constructor(v, index=self.index, columns=v.axes[0]).__finalize__(self) for k, v, in mgr.to_dict(copy=copy).items() } diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d84013e35d2d8..a4b65b4e91149 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -32,7 +32,7 @@ import numpy as np from pandas._config import config - +from pandas.core.indexers import maybe_convert_indices from pandas._libs import lib from pandas._libs.tslibs import ( Period, @@ -1503,6 +1503,10 @@ def equals(self, other: object) -> bool_t: if not (isinstance(other, type(self)) or isinstance(self, type(other))): return False other = cast(NDFrame, other) + if self.ndim != other.ndim: + return False + if not all(left.equals(right) for left, right in zip(self.axes, other.axes)): + return False return self._mgr.equals(other._mgr) # ------------------------------------------------------------------------- @@ -2150,6 +2154,11 @@ def __array_ufunc__( @final def __getstate__(self) 
-> dict[str, Any]: meta = {k: getattr(self, k, None) for k in self._metadata} + + # TODO: handle unpickling older pickles where index/columns are in mgr + meta["_index"] = self.index + if self.ndim == 2: + meta["_columns"] = self.columns return { "_mgr": self._mgr, "_typ": self._typ, @@ -3923,9 +3932,13 @@ def _take( verify=True, convert_indices=convert_indices, ) + axes_dict = self._construct_axes_dict() - # FIXME: get axes without mgr.axes - axes_dict[self._get_axis_name(axis)] = new_data.axes[self._get_block_manager_axis(axis)] + if convert_indices and isinstance(indices, np.ndarray): + # i.e. exclude slice, which in principle shouldn't be in a _take + indices = maybe_convert_indices(indices, len(self.axes[axis]), verify=True) + axes_dict[self._get_axis_name(axis)] = self.axes[axis].take(indices)#[indices] + return self._constructor(new_data, **axes_dict).__finalize__(self, method="take") def _take_with_is_copy(self: NDFrameT, indices, axis=0) -> NDFrameT: @@ -4297,13 +4310,19 @@ def __delitem__(self, key) -> None: # If the above loop ran and didn't delete anything because # there was no match, this call should raise the appropriate # exception: + + # make sure we access self.shape before calling mgr.idelete + is_deleted = np.zeros(self.shape[-1], dtype=np.bool_) + loc = self.axes[-1].get_loc(key) self._mgr = self._mgr.idelete(loc) - # FIXME: get axes without mgr.axes + + is_deleted[loc] = True + new_items = self.axes[-1][~is_deleted] if self.ndim == 1: - self._index = self._mgr.axes[0] + self._index = new_items else: - self._columns = self._mgr.axes[0] + self._columns = new_items # delete from the caches try: @@ -4568,6 +4587,9 @@ def drop( if inplace: self._update_inplace(obj) + self._index = obj._index + if self.ndim > 1: + self._columns = obj._columns else: return obj @@ -4643,6 +4665,8 @@ def _drop_axis( indexer = mask.nonzero()[0] new_axis = axis.take(indexer) + axes_dict = self._construct_axes_dict() + axes_dict[self._get_axis_name(axis_num)] = new_axis 
bm_axis = self.ndim - axis_num - 1 new_mgr = self._mgr.reindex_indexer( new_axis, @@ -4651,13 +4675,12 @@ def _drop_axis( allow_dups=True, only_slice=only_slice, ) - # FIXME: get axes without mgr.axes - axes_dict = self._get_axes_from_mgr(new_mgr) result = self._constructor(new_mgr, **axes_dict) if self.ndim == 1: result.name = self.name - return result.__finalize__(self) + out = result.__finalize__(self) + return out @final def _update_inplace(self, result, verify_is_copy: bool_t = True) -> None: @@ -4675,6 +4698,9 @@ def _update_inplace(self, result, verify_is_copy: bool_t = True) -> None: self._reset_cache() self._clear_item_cache() self._mgr = result._mgr + self._index = result._index + if self.ndim == 2: + self._columns = result._columns self._maybe_update_cacher(verify_is_copy=verify_is_copy, inplace=True) @final @@ -5083,6 +5109,7 @@ def sort_index( inplace = validate_bool_kwarg(inplace, "inplace") axis = self._get_axis_number(axis) + orig_axis = axis ascending = validate_ascending(ascending) target = self._get_axis(axis) @@ -5107,19 +5134,25 @@ def sort_index( baxis = self._get_block_manager_axis(axis) new_data = self._mgr.take(indexer, axis=baxis, verify=False) + axis_name = self._get_axis_name(axis) + axes_dict = self._construct_axes_dict() + axes_dict[axis_name] = self.axes[axis].take(indexer)._sort_levels_monotonic() + # reconstruct axis if needed - new_data.set_axis(baxis, new_data.axes[baxis]._sort_levels_monotonic()) + new_data.set_axis(baxis, axes_dict[axis_name]) if ignore_index: axis = 1 if isinstance(self, ABCDataFrame) else 0 - new_data.set_axis(axis, default_index(len(indexer))) + rng = default_index(len(indexer)) + new_data.set_axis(axis, rng) + + name = "columns" if orig_axis == 1 else "index" + axes_dict[name] = rng - # FIXME: get axes without mgr.axes - axes_dict = self._get_axes_from_mgr(new_data) result = self._constructor(new_data, **axes_dict) if inplace: - return self._update_inplace(result) + self._update_inplace(result) else: return 
result.__finalize__(self, method="sort_index") @@ -5426,6 +5459,9 @@ def _reindex_with_indexers( ) -> NDFrameT: """allow_dups indicates an internal call here""" # reindex doing multiple operations on different axes if indicated + axes_dict = self._construct_axes_dict() + axes_dict = {x: axes_dict[x].copy(deep=False) for x in axes_dict} + new_data = self._mgr for axis in sorted(reindexers.keys()): index, indexer = reindexers[axis] @@ -5449,12 +5485,13 @@ def _reindex_with_indexers( ) # If we've made a copy once, no need to make another one copy = False + axes_dict[self._get_axis_name(axis)] = index if copy and new_data is self._mgr: new_data = new_data.copy() # FIXME: get axes without mgr.axes - axes_dict = self._get_axes_from_mgr(new_data) + #axes_dict = self._get_axes_from_mgr(new_data) return self._constructor(new_data, **axes_dict).__finalize__(self) @@ -6114,16 +6151,16 @@ def _check_inplace_setting(self, value) -> bool_t: @final def _get_numeric_data(self: NDFrameT) -> NDFrameT: - mgr = self._mgr.get_numeric_data() - # FIXME: get axes without mgr.axes - axes_dict = self._get_axes_from_mgr(mgr) + mgr, taker = self._mgr.get_numeric_data() + axes_dict = self._construct_axes_dict() + axes_dict[self._get_axis_name(self.ndim-1)] = self.axes[-1].take(taker) return self._constructor(mgr, **axes_dict).__finalize__(self) @final def _get_bool_data(self): - mgr = self._mgr.get_bool_data() - # FIXME: get axes without mgr.axes - axes_dict = self._get_axes_from_mgr(mgr) + mgr, taker = self._mgr.get_bool_data() + axes_dict = self._construct_axes_dict() + axes_dict[self._get_axis_name(self.ndim-1)] = self.axes[-1].take(taker) return self._constructor(mgr, **axes_dict).__finalize__(self) # ---------------------------------------------------------------------- @@ -6465,6 +6502,8 @@ def copy(self: NDFrameT, deep: bool_t | None = True) -> NDFrameT: data = self._mgr.copy(deep=deep) self._clear_item_cache() axes_dict = self._construct_axes_dict() + # TODO: probably need to do this 
copy elsewhere? + axes_dict = {x: axes_dict[x].copy(deep=False) for x in axes_dict} return self._constructor(data, **axes_dict).__finalize__(self, method="copy") @final @@ -7388,7 +7427,8 @@ def replace( f'Invalid "to_replace" type: {repr(type(to_replace).__name__)}' ) - result = self._constructor(new_data) + axes_dict = self._construct_axes_dict() + result = self._constructor(new_data, **axes_dict) if inplace: return self._update_inplace(result) else: @@ -7679,8 +7719,8 @@ def interpolate( downcast=downcast, **kwargs, ) - - result = self._constructor(new_data) + axes_dict = obj._construct_axes_dict() + result = self._constructor(new_data, **axes_dict) if should_transpose: result = result.T if inplace: @@ -9668,8 +9708,9 @@ def _align_series( elif lidx is None or join_index is None: left = self.copy() if copy else self else: + new_mgr = self._mgr.reindex_indexer(join_index, lidx, axis=1, copy=copy) left = self._constructor( - self._mgr.reindex_indexer(join_index, lidx, axis=1, copy=copy) + new_mgr, columns=self.columns, index=join_index ) right = other._reindex_indexer(join_index, ridx, copy) @@ -9692,8 +9733,8 @@ def _align_series( if copy and fdata is self._mgr: fdata = fdata.copy() - # FIXME: get axes without mgr.axes - axes_dict = self._get_axes_from_mgr(fdata) + axes_dict = self._construct_axes_dict() + axes_dict["columns"] = join_index left = self._constructor(fdata, **axes_dict) if ridx is None: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 6664a91eda093..128f2372f27d8 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1320,7 +1320,7 @@ def _cython_transform( mgr: Manager2D = self._get_data_to_aggregate() orig_mgr_len = len(mgr) if numeric_only_bool: - mgr = mgr.get_numeric_data(copy=False) + mgr = mgr.get_numeric_data(copy=False)[0] def arr_func(bvalues: ArrayLike) -> ArrayLike: return self.grouper._cython_operation( diff --git a/pandas/core/groupby/groupby.py 
b/pandas/core/groupby/groupby.py index 4765f0e6bd8c2..43312cd9f5554 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1757,7 +1757,7 @@ def _cython_agg_general( f"{type(self).__name__}.{how} does not implement {kwd_name}." ) elif not is_ser: - data = data.get_numeric_data(copy=False) + data = data.get_numeric_data(copy=False)[0] def array_func(values: ArrayLike) -> ArrayLike: try: @@ -3372,7 +3372,7 @@ def blk_func(values: ArrayLike) -> ArrayLike: obj = self._obj_with_exclusions is_ser = obj.ndim == 1 mgr = self._get_data_to_aggregate() - data = mgr.get_numeric_data() if numeric_only_bool else mgr + data = mgr.get_numeric_data()[0] if numeric_only_bool else mgr ignore_failures = numeric_only_bool res_mgr = data.grouped_reduce(blk_func, ignore_failures=ignore_failures) @@ -3396,7 +3396,12 @@ def blk_func(values: ArrayLike) -> ArrayLike: if is_ser: res = self._wrap_agged_manager(res_mgr) else: - res = obj._constructor(res_mgr) + # FIXME: get axes without mgr.axes + axes_dict = {} + axes_dict["index"] = res_mgr.axes[-1] + if res_mgr.ndim == 2: + axes_dict["columns"] = res_mgr.axes[0] + res = obj._constructor(res_mgr, **axes_dict) if orig_scalar: # Avoid expensive MultiIndex construction @@ -3846,7 +3851,7 @@ def blk_func(values: ArrayLike) -> ArrayLike: orig_mgr_len = len(mgr) if numeric_only_bool: - mgr = mgr.get_numeric_data() + mgr = mgr.get_numeric_data()[0] res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=True) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 52150eafd7783..ef8a3f6aef191 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -828,7 +828,9 @@ def _view(self: _IndexT) -> _IndexT: result = self._simple_new(self._values, name=self._name) result._cache = self._cache + result._id = self._id return result + # TODO: preserve _id? 
@final def _rename(self: _IndexT, name: Hashable) -> _IndexT: diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index ed7307debbd19..780dc8695a87b 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1587,6 +1587,7 @@ def _get_list_axis(self, key, axis: int): return self.obj._take_with_is_copy(key, axis=axis) except IndexError as err: # re-raise with different error message + raise # watch out for case with wrong dtype key? raise IndexError("positional indexers are out-of-bounds") from err def _getitem_axis(self, key, axis: int): @@ -1606,6 +1607,9 @@ def _getitem_axis(self, key, axis: int): if isinstance(key, list): key = np.asarray(key) + #if len(key) == 0: + # key = key.astype(np.intp) + # TODO: if empty, do intp instead of float64? if com.is_bool_indexer(key): self._validate_key(key, axis) @@ -1768,10 +1772,11 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"): reindexers, allow_dups=True ) self.obj._mgr = new_obj._mgr - self.obj._index = self.obj._mgr.axes[-1] - if self.ndim == 2: - # FIXME: get axes without mgr.axes - self.obj._columns = self.obj._mgr.axes[0] + # TODO: use update_inplace? + if i == 0: + self.obj._index = labels + else: + self.obj._columns = labels self.obj._maybe_update_cacher(clear=True) self.obj._is_copy = None @@ -2086,8 +2091,8 @@ def _setitem_single_block(self, indexer, value, name: str) -> None: # actually do the set self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value) + # FIXME: update axes? self.obj._maybe_update_cacher(clear=True, inplace=True) - def _setitem_with_indexer_missing(self, indexer, value): """ @@ -2184,9 +2189,12 @@ def _setitem_with_indexer_missing(self, indexer, value): # dtype. 
But if we had a list or dict, then do inference df = df.infer_objects() self.obj._mgr = df._mgr + self.obj._index = df.index else: - self.obj._mgr = self.obj._append(value)._mgr + new_obj = self.obj._append(value) + self.obj._mgr = new_obj._mgr + self.obj._index = new_obj.index self.obj._maybe_update_cacher(clear=True) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 0fda74149fdf4..d9154aa8b212b 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -485,6 +485,7 @@ def get_bool_data(self: T, copy: bool = False) -> T: Whether to copy the blocks """ return self._get_data_subset(is_inferred_bool_dtype) + # FIXME: return indexer def get_numeric_data(self: T, copy: bool = False) -> T: """ diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py index c695f2c1e6ff1..995f9423cb5cb 100644 --- a/pandas/core/internals/base.py +++ b/pandas/core/internals/base.py @@ -124,8 +124,8 @@ def equals(self, other: object) -> bool: self_axes, other_axes = self.axes, other.axes if len(self_axes) != len(other_axes): return False - if not all(ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)): - return False + #if not all(ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)): + # return False return self._equal_values(other) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index f1fd5aed7dcf4..e3758dc5574c1 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -353,7 +353,7 @@ def apply( result_blocks = extend_blocks(applied, result_blocks) if ignore_failures: - return self._combine(result_blocks) + return self._combine(result_blocks)[0] out = type(self).from_blocks(result_blocks, self.axes) return out @@ -524,11 +524,11 @@ def is_view(self) -> bool: return False - def _get_data_subset(self: T, predicate: Callable) -> T: + def _get_data_subset(self: T, predicate: Callable) -> tuple[T, 
npt.NDArray[np.intp]]: blocks = [blk for blk in self.blocks if predicate(blk.values)] return self._combine(blocks, copy=False) - def get_bool_data(self: T, copy: bool = False) -> T: + def get_bool_data(self: T, copy: bool = False) -> tuple[T, npt.NDArray[np.intp]]: """ Select blocks that are bool-dtype and columns from object-dtype blocks that are all-bool. @@ -553,7 +553,7 @@ def get_bool_data(self: T, copy: bool = False) -> T: return self._combine(new_blocks, copy) - def get_numeric_data(self: T, copy: bool = False) -> T: + def get_numeric_data(self: T, copy: bool = False) -> tuple[T, npt.NDArray[np.intp]]: """ Parameters ---------- @@ -563,24 +563,26 @@ def get_numeric_data(self: T, copy: bool = False) -> T: numeric_blocks = [blk for blk in self.blocks if blk.is_numeric] if len(numeric_blocks) == len(self.blocks): # Avoid somewhat expensive _combine + taker = np.arange(len(self), dtype=np.intp) # TODO: return None to indicate no take needed? if copy: - return self.copy(deep=True) - return self + return self.copy(deep=True), taker + return self, taker return self._combine(numeric_blocks, copy) def _combine( self: T, blocks: list[Block], copy: bool = True, index: Index | None = None - ) -> T: + ) -> tuple[T, npt.NDArray[np.intp]]: """return a new manager with the blocks""" if len(blocks) == 0: + indexer = np.arange(0, dtype=np.intp) if self.ndim == 2: # retain our own Index dtype if index is not None: axes = [self.items[:0], index] else: axes = [self.items[:0]] + self.axes[1:] - return self.make_empty(axes) - return self.make_empty() + return self.make_empty(axes), indexer + return self.make_empty(), indexer # FIXME: optimization potential indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks])) @@ -604,7 +606,7 @@ def _combine( axes[-1] = index axes[0] = self.items.take(indexer) - return type(self).from_blocks(new_blocks, axes, new_refs) + return type(self).from_blocks(new_blocks, axes, new_refs), indexer @property def nblocks(self) -> int: @@ 
-1520,7 +1522,7 @@ def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T: if dropped_any: # faster to skip _combine if we haven't dropped any blocks - return self._combine(result_blocks, copy=False, index=index) + return self._combine(result_blocks, copy=False, index=index)[0] return type(self).from_blocks(result_blocks, [self.axes[0], index]) @@ -1554,7 +1556,7 @@ def reduce( if ignore_failures: if res_blocks: indexer = np.concatenate([blk.mgr_locs.as_array for blk in res_blocks]) - new_mgr = self._combine(res_blocks, copy=False, index=index) + new_mgr = self._combine(res_blocks, copy=False, index=index)[0] else: indexer = [] new_mgr = type(self).from_blocks([], [self.items[:0], index]) @@ -1618,7 +1620,7 @@ def quantile( # ---------------------------------------------------------------- - def unstack(self, unstacker, fill_value) -> BlockManager: + def unstack(self, unstacker, fill_value) -> tuple[BlockManager, list[np.ndarray]]: """ Return a BlockManager with all blocks unstacked. @@ -1677,7 +1679,7 @@ def unstack(self, unstacker, fill_value) -> BlockManager: new_columns = new_columns[columns_mask] bm = BlockManager(new_blocks, [new_columns, new_index], verify_integrity=False) - return bm + return bm, columns_mask def to_dict(self, copy: bool = True): """ @@ -1697,7 +1699,7 @@ def to_dict(self, copy: bool = True): bd.setdefault(str(b.dtype), []).append(b) # TODO(EA2D): the combine will be unnecessary with 2D EAs - return {dtype: self._combine(blocks, copy=copy) for dtype, blocks in bd.items()} + return {dtype: self._combine(blocks, copy=copy)[0] for dtype, blocks in bd.items()} def as_array( self, @@ -1858,7 +1860,7 @@ def _consolidate_inplace(self) -> None: def __len__(self) -> int: # TODO: cache? 
would need to invalidate akin to blklocs - return sum(x.shape[1] for x in self.blocks) + return sum(x.shape[0] for x in self.blocks) class SingleBlockManager(BaseBlockManager, SingleDataManager): @@ -2053,8 +2055,9 @@ def array_values(self): def get_numeric_data(self, copy: bool = False): if self._block.is_numeric: - return self.copy(deep=copy) - return self.make_empty() + return self.copy(deep=copy), taker + taker = np.array([], dtype=np.intp) + return self.make_empty(), taker @property def _can_hold_na(self) -> bool: diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 85731bbde6d40..3411e674afa6f 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1937,7 +1937,7 @@ def _take_new_index( new_mgr = obj._mgr.reindex_indexer(new_axis=new_index, indexer=indexer, axis=1) # error: Incompatible return value type # (got "DataFrame", expected "NDFrameT") - return obj._constructor(new_mgr) # type: ignore[return-value] + return obj._constructor(new_mgr, index=new_index, columns=obj.columns) # type: ignore[return-value] else: raise ValueError("'obj' should be either a Series or a DataFrame") diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 0270a5dd75952..a788bd2f0dbd7 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -500,8 +500,10 @@ def _unstack_frame(obj: DataFrame, level, fill_value=None): unstacker = _Unstacker(obj.index, level=level, constructor=obj._constructor) if not obj._can_fast_transpose: - mgr = obj._mgr.unstack(unstacker, fill_value=fill_value) - return obj._constructor(mgr) + mgr, columns_mask = obj._mgr.unstack(unstacker, fill_value=fill_value) + new_columns = unstacker.get_new_columns(obj.columns) + new_columns = new_columns[columns_mask] + return obj._constructor(mgr, index=unstacker.new_index, columns=new_columns) else: return unstacker.get_result( obj._values, value_columns=obj.columns, fill_value=fill_value diff --git a/pandas/core/series.py 
b/pandas/core/series.py index f1ed825dd4f2c..8f5802d0b5497 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2360,6 +2360,7 @@ def drop_duplicates( result = super().drop_duplicates(keep=keep) if inplace: self._update_inplace(result) + self._index = result.index return None else: return result diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index b30b27f5bae1a..ecf9aacae2c72 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -714,7 +714,7 @@ def test_consolidate_ordering_issues(self, mgr): # we have datetime/tz blocks in mgr cons = mgr.consolidate() assert cons.nblocks == 4 - cons = mgr.consolidate().get_numeric_data() + cons = mgr.consolidate().get_numeric_data()[0] assert cons.nblocks == 1 assert isinstance(cons.blocks[0].mgr_locs, BlockPlacement) tm.assert_numpy_array_equal( @@ -752,7 +752,7 @@ def test_get_numeric_data(self, using_copy_on_write): ) mgr.iset(5, np.array([1, 2, 3], dtype=np.object_)) - numeric = mgr.get_numeric_data() + numeric = mgr.get_numeric_data()[0] tm.assert_index_equal(numeric.items, Index(["int", "float", "complex", "bool"])) tm.assert_almost_equal( mgr.iget(mgr.items.get_loc("float")).internal_values(), @@ -776,7 +776,7 @@ def test_get_numeric_data(self, using_copy_on_write): np.array([100.0, 200.0, 300.0]), ) - numeric2 = mgr.get_numeric_data(copy=True) + numeric2 = mgr.get_numeric_data(copy=True)[0] tm.assert_index_equal(numeric.items, Index(["int", "float", "complex", "bool"])) numeric2.iset( numeric2.items.get_loc("float"), @@ -804,7 +804,7 @@ def test_get_bool_data(self, using_copy_on_write): mgr.iset(6, np.array([True, False, True], dtype=np.object_)) with tm.assert_produces_warning(FutureWarning, match=msg): - bools = mgr.get_bool_data() + bools = mgr.get_bool_data()[0] tm.assert_index_equal(bools.items, Index(["bool", "dt"])) tm.assert_almost_equal( mgr.iget(mgr.items.get_loc("bool")).internal_values(), @@ -825,7 
+825,7 @@ def test_get_bool_data(self, using_copy_on_write): # Check sharing with tm.assert_produces_warning(FutureWarning, match=msg): - bools2 = mgr.get_bool_data(copy=True) + bools2 = mgr.get_bool_data(copy=True)[0] bools2.iset(0, np.array([False, True, False])) if using_copy_on_write: tm.assert_numpy_array_equal( diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py index b64c7bec6ea39..7ed3273b2e6a0 100644 --- a/pandas/tests/series/methods/test_reindex.py +++ b/pandas/tests/series/methods/test_reindex.py @@ -22,6 +22,7 @@ def test_reindex(datetime_series, string_series): identity = string_series.reindex(string_series.index) + # TODO: is the comment below still accurate for supported numpies? # __array_interface__ is not defined for older numpies # and on some pythons try: From e8ce5e8b1aaedf52687dbcd59000fcd575872835 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 16 Sep 2022 13:44:23 -0700 Subject: [PATCH 4/4] down to 322 fails --- pandas/core/frame.py | 13 ++++--------- pandas/core/generic.py | 13 +++++++++---- pandas/core/groupby/generic.py | 19 ++++++++++++++++--- pandas/core/groupby/groupby.py | 18 ++++++++++++------ pandas/core/internals/array_manager.py | 18 ++++++++++-------- pandas/core/internals/base.py | 2 +- pandas/core/internals/managers.py | 9 ++++++--- 7 files changed, 58 insertions(+), 34 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 76b01b069d2dd..9808b4def6e6f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4760,7 +4760,8 @@ def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool: and not is_bool_dtype(dtype) ) - def predicate(dtype: DtypeObj) -> bool: + def predicate(arr: ArrayLike) -> bool: + dtype = arr.dtype if include: if not dtype_predicate(dtype, include): return False @@ -4771,14 +4772,8 @@ def predicate(dtype: DtypeObj) -> bool: return True - def arr_predicate(arr: ArrayLike) -> bool: - dtype = arr.dtype - return predicate(dtype) - - mgr, 
taker = self._mgr._get_data_subset(arr_predicate).copy(deep=None) - # FIXME: get axes without mgr.axes - # FIXME: return taker from _get_data_subset, this is really slow - #taker = self.dtypes.apply(predicate).values.nonzero()[0] + mgr, taker = self._mgr._get_data_subset(predicate) + mgr = mgr.copy(deep=None) columns = self.columns.take(taker) return type(self)(mgr, columns=columns, index=self.index).__finalize__(self) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a4b65b4e91149..f744d0a2cc7a6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -840,7 +840,7 @@ def _validate_set_axis(self, axis: int, new_labels: Index) -> None: old_len = self.shape[axis] new_len = len(new_labels) - if axis == 1 and len(self.columns) == 0: + if self.ndim > 1 and axis == 0 and len(self.columns) == 0: # If we are setting the index on a DataFrame with no columns, # it is OK to change the length. pass @@ -3933,6 +3933,14 @@ def _take( convert_indices=convert_indices, ) + # We have 6 tests that get here with a slice; TODO: maybe avoid? + # TODO: de-duplicate with similar inside BlockManager.take + indices = ( + np.arange(indices.start, indices.stop, indices.step, dtype=np.intp) + if isinstance(indices, slice) + else np.asanyarray(indices, dtype=np.intp) # <- converts some cases with empty float64 + ) + axes_dict = self._construct_axes_dict() if convert_indices and isinstance(indices, np.ndarray): # i.e. 
exclude slice, which in principle shouldn't be in a _take @@ -5490,9 +5498,6 @@ def _reindex_with_indexers( if copy and new_data is self._mgr: new_data = new_data.copy() - # FIXME: get axes without mgr.axes - #axes_dict = self._get_axes_from_mgr(new_data) - return self._constructor(new_data, **axes_dict).__finalize__(self) def filter( diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 128f2372f27d8..7326a79a9654b 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -175,6 +175,7 @@ def _wrap_agged_manager(self, mgr: Manager) -> Series: else: mgr = cast(Manager2D, mgr) single = mgr.iget(0) + #breakpoint() # FIXME: get axes without mgr.axes index = single.axes[0] ser = self.obj._constructor(single, index=index, name=self.obj.name) @@ -1329,14 +1330,26 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike: # We could use `mgr.apply` here and not have to set_axis, but # we would have to do shape gymnastics for ArrayManager compat - res_mgr = mgr.grouped_reduce(arr_func, ignore_failures=True) + res_mgr, taker = mgr.grouped_reduce(arr_func, ignore_failures=True) res_mgr.set_axis(1, mgr.axes[1]) if len(res_mgr) < orig_mgr_len: warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only) - # FIXME: get axes without mgr.axes - res_df = self.obj._constructor(res_mgr, index=res_mgr.axes[1], columns=res_mgr.axes[0]) + columns = mgr.axes[0] + index = res_mgr.axes[1] # FIXME: get index without res_mgr.axes + if self.axis == 0: + + pass#index = self._obj_with_exclusions.index + #columns = columns[taker] + #breakpoint() + else: + #columns = self._obj_with_exclusions.index + pass#index = self._obj_with_exclusions.columns + #breakpoint() + + columns = columns[taker] + res_df = self.obj._constructor(res_mgr, index=index, columns=columns) if self.axis == 1: res_df = res_df.T return res_df diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 43312cd9f5554..ff46904758025 100644 --- 
a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1780,7 +1780,7 @@ def array_func(values: ArrayLike) -> ArrayLike: # TypeError -> we may have an exception in trying to aggregate # continue and exclude the block - new_mgr = data.grouped_reduce(array_func, ignore_failures=ignore_failures) + new_mgr, taker = data.grouped_reduce(array_func, ignore_failures=ignore_failures) if not is_ser and len(new_mgr) < orig_len: warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only) @@ -2055,7 +2055,7 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: return counted[0] return counted - new_mgr = data.grouped_reduce(hfunc) + new_mgr, taker = data.grouped_reduce(hfunc) # If we are grouping on categoricals we want unobserved categories to # return zero, rather than the default of NaN which the reindexing in @@ -3374,7 +3374,7 @@ def blk_func(values: ArrayLike) -> ArrayLike: mgr = self._get_data_to_aggregate() data = mgr.get_numeric_data()[0] if numeric_only_bool else mgr ignore_failures = numeric_only_bool - res_mgr = data.grouped_reduce(blk_func, ignore_failures=ignore_failures) + res_mgr, taker = data.grouped_reduce(blk_func, ignore_failures=ignore_failures) if ( numeric_only is lib.no_default @@ -3401,6 +3401,7 @@ def blk_func(values: ArrayLike) -> ArrayLike: axes_dict["index"] = res_mgr.axes[-1] if res_mgr.ndim == 2: axes_dict["columns"] = res_mgr.axes[0] + #breakpoint() res = obj._constructor(res_mgr, **axes_dict) if orig_scalar: @@ -3693,7 +3694,7 @@ def cummin(self, axis=0, numeric_only=False, **kwargs) -> NDFrameT: skipna = kwargs.get("skipna", True) if axis != 0: f = lambda x: np.minimum.accumulate(x, axis) - numeric_only_bool = self._resolve_numeric_only("cummax", numeric_only, axis) + numeric_only_bool = self._resolve_numeric_only("cummax", numeric_only, axis) # TODO: "cummin"? 
obj = self._selected_obj if numeric_only_bool: obj = obj._get_numeric_data() @@ -3853,7 +3854,7 @@ def blk_func(values: ArrayLike) -> ArrayLike: if numeric_only_bool: mgr = mgr.get_numeric_data()[0] - res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=True) + res_mgr, taker = mgr.grouped_reduce(blk_func, ignore_failures=True) if not is_ser and len(res_mgr.items) != orig_mgr_len: howstr = how.replace("group_", "") @@ -3871,7 +3872,12 @@ def blk_func(values: ArrayLike) -> ArrayLike: out = self._wrap_agged_manager(res_mgr) else: # FIXME: get axes without mgr.axes - out = obj._constructor(res_mgr, index=res_mgr.axes[1], columns=res_mgr.axes[0]) + if self.axis == 0 and not numeric_only_bool: + columns = self._obj_with_exclusions.columns[taker] + else: + #breakpoint() + columns = res_mgr.axes[0] + out = obj._constructor(res_mgr, index=res_mgr.axes[1], columns=columns) return self._wrap_aggregated_output(out) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index d9154aa8b212b..9e0e832b8f3a5 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -464,7 +464,7 @@ def is_view(self) -> bool: def is_single_block(self) -> bool: return len(self.arrays) == 1 - def _get_data_subset(self: T, predicate: Callable) -> T: + def _get_data_subset(self: T, predicate: Callable) -> tuple[T, npt.NDArray[np.intp]]: indices = [i for i, arr in enumerate(self.arrays) if predicate(arr)] arrays = [self.arrays[i] for i in indices] # TODO copy? 
@@ -473,9 +473,9 @@ def _get_data_subset(self: T, predicate: Callable) -> T: taker = np.array(indices, dtype="intp") new_cols = self._axes[1].take(taker) new_axes = [self._axes[0], new_cols] - return type(self)(arrays, new_axes, verify_integrity=False) + return type(self)(arrays, new_axes, verify_integrity=False), taker - def get_bool_data(self: T, copy: bool = False) -> T: + def get_bool_data(self: T, copy: bool = False) -> tuple[T, npt.NDArray[np.intp]]: """ Select columns that are bool-dtype and object-dtype columns that are all-bool. @@ -485,9 +485,8 @@ def get_bool_data(self: T, copy: bool = False) -> T: Whether to copy the blocks """ return self._get_data_subset(is_inferred_bool_dtype) - # FIXME: return indexer - def get_numeric_data(self: T, copy: bool = False) -> T: + def get_numeric_data(self: T, copy: bool = False) -> tuple[T, npt.NDArray[np.intp]]: """ Select columns that have a numeric dtype. @@ -935,7 +934,7 @@ def idelete(self, indexer) -> ArrayManager: # -------------------------------------------------------------------- # Array-wise Operation - def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T: + def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> tuple[T, npt.NDArray[np.intp]]: """ Apply grouped reduction function columnwise, returning a new ArrayManager. 
@@ -948,6 +947,7 @@ def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T: Returns ------- ArrayManager + np.ndarray[intp] """ result_arrays: list[np.ndarray] = [] result_indices: list[int] = [] @@ -975,14 +975,16 @@ def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T: else: index = Index(range(result_arrays[0].shape[0])) + taker = None if ignore_failures: - columns = self.items[np.array(result_indices, dtype="int64")] + taker = np.array(result_indices, dtype=np.intp) + columns = self.items[taker] else: columns = self.items # error: Argument 1 to "ArrayManager" has incompatible type "List[ndarray]"; # expected "List[Union[ndarray, ExtensionArray]]" - return type(self)(result_arrays, [index, columns]) # type: ignore[arg-type] + return type(self)(result_arrays, [index, columns]), taker # type: ignore[arg-type] def reduce( self: T, func: Callable, ignore_failures: bool = False diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py index 995f9423cb5cb..f082f2c3778d4 100644 --- a/pandas/core/internals/base.py +++ b/pandas/core/internals/base.py @@ -203,7 +203,7 @@ def grouped_reduce(self, func, ignore_failures: bool = False): index = default_index(len(res)) mgr = type(self).from_array(res, index) - return mgr + return mgr, np.arange(len(res), dtype=np.intp) # TODO: is taker meaningful here? 
@classmethod def from_array(cls, arr: ArrayLike, index: Index): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index e3758dc5574c1..130f70fb9fb2a 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1475,7 +1475,7 @@ def idelete(self, indexer) -> BlockManager: # ---------------------------------------------------------------- # Block-wise Operation - def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T: + def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> tuple[T, npt.NDArray[np.intp]]: """ Apply grouped reduction function blockwise, returning a new BlockManager. @@ -1488,6 +1488,7 @@ def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T: Returns ------- BlockManager + np.ndarray[intp] """ result_blocks: list[Block] = [] dropped_any = False @@ -1522,9 +1523,10 @@ def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T: if dropped_any: # faster to skip _combine if we haven't dropped any blocks - return self._combine(result_blocks, copy=False, index=index)[0] + return self._combine(result_blocks, copy=False, index=index) - return type(self).from_blocks(result_blocks, [self.axes[0], index]) + taker = np.arange(len(self), dtype=np.intp) + return type(self).from_blocks(result_blocks, [self.axes[0], index]), taker def reduce( self: T, func: Callable, ignore_failures: bool = False @@ -2055,6 +2057,7 @@ def array_values(self): def get_numeric_data(self, copy: bool = False): if self._block.is_numeric: + taker = np.arange(len(self.items), dtype=np.intp) return self.copy(deep=copy), taker taker = np.array([], dtype=np.intp) return self.make_empty(), taker