Skip to content

POC/REF: remove axes from Managers #48126

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into the base branch from the contributor's branch
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions pandas/_libs/properties.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,10 @@ cdef class AxisProperty:
if obj is None:
# Only instances have _mgr, not classes
return self
if self.axis == 0:
return obj._index
else:
axes = obj._mgr.axes
return axes[self.axis]
return obj._columns

def __set__(self, obj, value):
    # Assigning to the axis attribute (e.g. `df.index = ...`) delegates
    # to the owner object's _set_axis, keyed by this property's axis
    # number, so the owner performs any validation/cache updates itself.
    obj._set_axis(self.axis, value)
7 changes: 5 additions & 2 deletions pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -726,7 +726,8 @@ def apply(self) -> DataFrame | Series:
with np.errstate(all="ignore"):
results = self.obj._mgr.apply("apply", func=self.f)
# _constructor will retain self.index and self.columns
return self.obj._constructor(data=results)
axes_dict = self.obj._construct_axes_dict()
return self.obj._constructor(data=results, **axes_dict)

# broadcasting
if self.result_type == "broadcast":
Expand Down Expand Up @@ -1001,6 +1002,7 @@ def series_generator(self):
# We create one Series object, and will swap out the data inside
# of it. Kids: don't do this at home.
ser = self.obj._ixs(0, axis=0)
index = ser.index
mgr = ser._mgr

if is_extension_array_dtype(ser.dtype):
Expand All @@ -1012,9 +1014,10 @@ def series_generator(self):

else:
for (arr, name) in zip(values, self.index):
# GH#35462 re-pin mgr in case setitem changed it
# GH#35462 re-pin mgr, index in case setitem changed it
ser._mgr = mgr
mgr.set_values(arr)
ser._index = index
object.__setattr__(ser, "_name", name)
yield ser

Expand Down
2 changes: 2 additions & 0 deletions pandas/core/arraylike.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,8 @@ def _reconstruct(result):
return result
if isinstance(result, BlockManager):
# we went through BlockManager.apply e.g. np.sqrt
# TODO: any cases that aren't index/columns-preserving?
reconstruct_kwargs.update(self._construct_axes_dict())
result = self._constructor(result, **reconstruct_kwargs, copy=False)
else:
# we converted an array, lost our axes
Expand Down
109 changes: 81 additions & 28 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -590,7 +590,7 @@ class DataFrame(NDFrame, OpsMixin):
2 2 3
"""

_internal_names_set = {"columns", "index"} | NDFrame._internal_names_set
_internal_names_set = {"_columns", "columns", "_index", "index"} | NDFrame._internal_names_set
_typ = "dataframe"
_HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray)
_accessors: set[str] = {"sparse"}
Expand Down Expand Up @@ -621,11 +621,25 @@ def __init__(
dtype = self._validate_dtype(dtype)

if isinstance(data, DataFrame):
if index is None:
index = data.index
if columns is None:
columns = data.columns
data = data._mgr

if isinstance(data, (BlockManager, ArrayManager)):
# first check if a Manager is passed without any other arguments
# -> use fastpath (without checking Manager type)
if index is None or columns is None:
assert False
if data.axes[0] is not columns or data.axes[1] is not index:
# FIXME: without this check, json tests segfault...
# nope, segfaults even with this check
data.axes = [ensure_index(columns), ensure_index(index)]
#if not index.equals(data.axes[-1]):#index is not data.axes[-1]:
# assert False
#if not columns.equals(data.axes[0]):#columns is not data.axes[0]:
# assert False
if index is None and columns is None and dtype is None and not copy:
# GH#33357 fastpath
NDFrame.__init__(self, data)
Expand Down Expand Up @@ -751,7 +765,7 @@ def __init__(
index, # type: ignore[arg-type]
dtype,
)
mgr = arrays_to_mgr(
mgr, _, _ = arrays_to_mgr(
arrays,
columns,
index,
Expand Down Expand Up @@ -794,7 +808,7 @@ def __init__(
construct_1d_arraylike_from_scalar(data, len(index), dtype)
for _ in range(len(columns))
]
mgr = arrays_to_mgr(values, columns, index, dtype=None, typ=manager)
mgr, _, _ = arrays_to_mgr(values, columns, index, dtype=None, typ=manager)
else:
arr2d = construct_2d_arraylike_from_scalar(
data,
Expand Down Expand Up @@ -2399,9 +2413,9 @@ def maybe_reorder(
columns = columns.drop(exclude)

manager = get_option("mode.data_manager")
mgr = arrays_to_mgr(arrays, columns, result_index, typ=manager)
mgr, index, columns = arrays_to_mgr(arrays, columns, result_index, typ=manager)

return cls(mgr)
return cls(mgr, index=index, columns=columns)

def to_records(
self, index: bool = True, column_dtypes=None, index_dtypes=None
Expand Down Expand Up @@ -2603,15 +2617,15 @@ def _from_arrays(
columns = ensure_index(columns)
if len(columns) != len(arrays):
raise ValueError("len(columns) must match len(arrays)")
mgr = arrays_to_mgr(
mgr, index, columns = arrays_to_mgr(
arrays,
columns,
index,
dtype=dtype,
verify_integrity=verify_integrity,
typ=manager,
)
return cls(mgr)
return cls(mgr, index=index, columns=columns)

@doc(
storage_options=_shared_docs["storage_options"],
Expand Down Expand Up @@ -3729,7 +3743,7 @@ def _ixs(self, i: int, axis: int = 0) -> Series:

# if we are a copy, mark as such
copy = isinstance(new_mgr.array, np.ndarray) and new_mgr.array.base is None
result = self._constructor_sliced(new_mgr, name=self.index[i]).__finalize__(
result = self._constructor_sliced(new_mgr, index=self.columns, name=self.index[i]).__finalize__(
self
)
result._set_is_copy(self, copy=copy)
Expand Down Expand Up @@ -4154,6 +4168,7 @@ def _set_item_mgr(self, key, value: ArrayLike) -> None:
except KeyError:
# This item wasn't present, just insert at end
self._mgr.insert(len(self._info_axis), key, value)
self._columns = self.columns.insert(len(self._info_axis), key)
else:
self._iset_item_mgr(loc, value)

Expand Down Expand Up @@ -4257,6 +4272,7 @@ def _ensure_valid_index(self, value) -> None:
index_copy.name = self.index.name

self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan)
self._index = index_copy

def _box_col_values(self, values: SingleDataManager, loc: int) -> Series:
"""
Expand All @@ -4267,7 +4283,7 @@ def _box_col_values(self, values: SingleDataManager, loc: int) -> Series:
name = self.columns[loc]
klass = self._constructor_sliced
# We get index=self.index bc values is a SingleDataManager
return klass(values, name=name, fastpath=True).__finalize__(self)
return klass(values, name=name, index=self.index, fastpath=True).__finalize__(self)

# ----------------------------------------------------------------------
# Lookup Caching
Expand Down Expand Up @@ -4486,6 +4502,8 @@ def query(self, expr: str, inplace: bool = False, **kwargs) -> DataFrame | None:

if inplace:
self._update_inplace(result)
self._index = result._index
self._columns = result._columns
return None
else:
return result
Expand Down Expand Up @@ -4754,8 +4772,10 @@ def predicate(arr: ArrayLike) -> bool:

return True

mgr = self._mgr._get_data_subset(predicate).copy(deep=None)
return type(self)(mgr).__finalize__(self)
mgr, taker = self._mgr._get_data_subset(predicate)
mgr = mgr.copy(deep=None)
columns = self.columns.take(taker)
return type(self)(mgr, columns=columns, index=self.index).__finalize__(self)

def insert(
self,
Expand Down Expand Up @@ -4824,6 +4844,7 @@ def insert(

value = self._sanitize_column(value)
self._mgr.insert(loc, column, value)
self._columns = self.columns.insert(loc, column)

def assign(self, **kwargs) -> DataFrame:
r"""
Expand Down Expand Up @@ -5855,7 +5876,7 @@ def shift(
fill_value=fill_value,
allow_dups=True,
)
res_df = self._constructor(mgr)
res_df = self._constructor(mgr, columns=self.columns, index=self.index)
return res_df.__finalize__(self, method="shift")

return super().shift(
Expand Down Expand Up @@ -6382,7 +6403,8 @@ class max type

@doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
def isna(self) -> DataFrame:
    """
    Return a boolean same-shaped DataFrame indicating missing values.

    Returns
    -------
    DataFrame
        Mask of bools with the same index/columns as ``self``.
    """
    # NOTE(review): the scraped diff interleaved the pre-change call
    # (`self._constructor(self._mgr.isna(func=isna))`) with the added
    # lines; this is the post-change version, which passes the axes
    # explicitly because the Manager no longer carries them.
    axes_dict = self._construct_axes_dict()
    result = self._constructor(self._mgr.isna(func=isna), **axes_dict)
    # Propagate metadata (attrs/flags) from self per pandas convention.
    return result.__finalize__(self, method="isna")

@doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
Expand Down Expand Up @@ -6587,6 +6609,8 @@ def dropna(
if not inplace:
return result
self._update_inplace(result)
self._columns = result._columns
self._index = result._index
return None

@deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "subset"])
Expand Down Expand Up @@ -6685,6 +6709,8 @@ def drop_duplicates(

if inplace:
self._update_inplace(result)
self._index = result._index
self._columns = result._columns
return None
else:
return result
Expand Down Expand Up @@ -6934,16 +6960,27 @@ def sort_values( # type: ignore[override]
else:
return self.copy()

bm_axis = self._get_block_manager_axis(axis)

new_data = self._mgr.take(
indexer, axis=self._get_block_manager_axis(axis), verify=False
indexer, axis=bm_axis, verify=False
)

axis_name = self._get_axis_name(axis)

axes_dict = {}
axes_dict[axis_name] = self.axes[axis].take(indexer)
if axis == 0:
axes_dict["columns"] = self.columns
else:
axes_dict["index"] = self.index

if ignore_index:
new_data.set_axis(
self._get_block_manager_axis(axis), default_index(len(indexer))
)
rng = default_index(len(indexer))
new_data.set_axis(bm_axis, rng)
axes_dict[axis_name] = rng

result = self._constructor(new_data)
result = self._constructor(new_data, **axes_dict)
if inplace:
return self._update_inplace(result)
else:
Expand Down Expand Up @@ -7627,7 +7664,7 @@ def _dispatch_frame_op(self, right, func: Callable, axis: int | None = None):
# i.e. scalar, faster than checking np.ndim(right) == 0
with np.errstate(all="ignore"):
bm = self._mgr.apply(array_op, right=right)
return self._constructor(bm)
return self._constructor(bm, index=self.index, columns=self.columns)

elif isinstance(right, DataFrame):
assert self.index.equals(right.index)
Expand All @@ -7648,7 +7685,7 @@ def _dispatch_frame_op(self, right, func: Callable, axis: int | None = None):
right._mgr, # type: ignore[arg-type]
array_op,
)
return self._constructor(bm)
return self._constructor(bm, index=self.index, columns=self.columns)

elif isinstance(right, Series) and axis == 1:
# axis=1 means we want to operate row-by-row
Expand Down Expand Up @@ -9239,7 +9276,7 @@ def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame:
axis = 0

new_data = self._mgr.diff(n=periods, axis=axis)
return self._constructor(new_data).__finalize__(self, "diff")
return self._constructor(new_data, index=self.index, columns=self.columns).__finalize__(self, "diff")

# ----------------------------------------------------------------------
# Function application
Expand Down Expand Up @@ -10850,8 +10887,9 @@ def _reduce(
# cols = self.columns[~dt64_cols]
# self = self[cols]
predicate = lambda x: not is_datetime64_any_dtype(x.dtype)
mgr = self._mgr._get_data_subset(predicate)
self = type(self)(mgr)
mgr, taker = self._mgr._get_data_subset(predicate)
columns = self.columns[taker]
self = type(self)(mgr, index=self.index, columns=columns)

# TODO: Make other agg func handle axis=None properly GH#21597
axis = self._get_axis_number(axis)
Expand Down Expand Up @@ -10900,7 +10938,20 @@ def _get_data() -> DataFrame:
# After possibly _get_data and transposing, we are now in the
# simple case where we can use BlockManager.reduce
res, _ = df._mgr.reduce(blk_func, ignore_failures=ignore_failures)
out = df._constructor(res).iloc[0]
index = Index([None], dtype=object)
assert index.equals(res.axes[1])
if ignore_failures:
if len(res.items) == len(df.columns):
# i.e. nothing was dropped
columns = df.columns
else:
# FIXME: get axes without mgr.axes; THIS IS WRONG TOO
columns = res.axes[0]
else:
columns = df.columns
assert columns.equals(res.axes[0])

out = df._constructor(res, index=index, columns=columns).iloc[0]
if out_dtype is not None:
out = out.astype(out_dtype)
if axis == 0 and len(self) == 0 and name in ["sum", "prod"]:
Expand Down Expand Up @@ -11398,7 +11449,8 @@ def quantile(
res = data._mgr.take(indexer[q_idx], verify=False)
res.axes[1] = q

result = self._constructor(res)
# FIXME: get axes without mgr.axes
result = self._constructor(res, columns=res.axes[0], index=res.axes[1])
return result.__finalize__(self, method="quantile")

@doc(NDFrame.asfreq, **_shared_doc_kwargs)
Expand Down Expand Up @@ -11665,9 +11717,9 @@ def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame:
_info_axis_name = "columns"

index = properties.AxisProperty(
axis=1, doc="The index (row labels) of the DataFrame."
axis=0, doc="The index (row labels) of the DataFrame."
)
columns = properties.AxisProperty(axis=0, doc="The column labels of the DataFrame.")
columns = properties.AxisProperty(axis=1, doc="The column labels of the DataFrame.")

@property
def _AXIS_NUMBERS(self) -> dict[str, int]:
Expand Down Expand Up @@ -11702,8 +11754,9 @@ def _to_dict_of_blocks(self, copy: bool = True):
# convert to BlockManager if needed -> this way support ArrayManager as well
mgr = mgr_to_mgr(mgr, "block")
mgr = cast(BlockManager, mgr)
# FIXME: get axes without mgr.axes
return {
k: self._constructor(v).__finalize__(self)
k: self._constructor(v, index=self.index, columns=v.axes[0]).__finalize__(self)
for k, v, in mgr.to_dict(copy=copy).items()
}

Expand Down
Loading