Skip to content

Commit

Permalink
Add ensure_index to not unnecessarily shallow copy cudf.Index (rapids…
Browse files Browse the repository at this point in the history
…ai#16117)

The `cudf.Index` constructor will shallow copy a `cudf.Index` input. Sometimes we only need to make sure an input is a `cudf.Index`, so this change adds `ensure_index` (pandas has something similar) so that we don't shallow copy these inputs unnecessarily.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: rapidsai#16117
  • Loading branch information
mroeschke authored Jun 28, 2024
1 parent 57862a3 commit 2b547dc
Show file tree
Hide file tree
Showing 9 changed files with 73 additions and 31 deletions.
6 changes: 5 additions & 1 deletion python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1104,7 +1104,11 @@ def difference(self, other, sort=None):
f"of [None, False, True]; {sort} was passed."
)

other = cudf.Index(other, name=getattr(other, "name", self.name))
if not isinstance(other, BaseIndex):
other = cudf.Index(
other,
name=getattr(other, "name", self.name),
)

if not len(other):
res = self._get_reconciled_name_object(other).unique()
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from cudf.core.column import as_column
from cudf.core.copy_types import BooleanMask
from cudf.core.index import Index, RangeIndex
from cudf.core.index import RangeIndex, ensure_index
from cudf.core.indexed_frame import IndexedFrame
from cudf.core.scalar import Scalar
from cudf.options import get_option
Expand Down Expand Up @@ -107,7 +107,7 @@ def factorize(values, sort=False, use_na_sentinel=True, size_hint=None):
dtype="int64" if get_option("mode.pandas_compatible") else None,
).values

return labels, cats.values if return_cupy_array else Index(cats)
return labels, cats.values if return_cupy_array else ensure_index(cats)


def _linear_interpolation(column, index=None):
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/cut.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,7 @@ def cut(
)

# we return a categorical index, as we don't have a Categorical method
categorical_index = cudf.Index(col)
categorical_index = cudf.CategoricalIndex._from_data({None: col})

if isinstance(orig_x, (pd.Series, cudf.Series)):
# if we have a series input we return a series output
Expand Down
29 changes: 17 additions & 12 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,12 @@
from cudf.core.column_accessor import ColumnAccessor
from cudf.core.copy_types import BooleanMask
from cudf.core.groupby.groupby import DataFrameGroupBy, groupby_doc_template
from cudf.core.index import BaseIndex, RangeIndex, _index_from_data, as_index
from cudf.core.index import (
BaseIndex,
RangeIndex,
_index_from_data,
ensure_index,
)
from cudf.core.indexed_frame import (
IndexedFrame,
_FrameIndexer,
Expand Down Expand Up @@ -338,7 +343,7 @@ def _getitem_tuple_arg(self, arg):
range(len(tmp_arg[0]))
)
},
index=as_index(tmp_arg[0]),
index=cudf.Index(tmp_arg[0]),
)
columns_df[cantor_name] = column.as_column(
range(len(columns_df))
Expand Down Expand Up @@ -702,7 +707,7 @@ def __init__(
data = data.reindex(index)
index = data.index
else:
index = cudf.Index(index)
index = ensure_index(index)
else:
index = data.index

Expand Down Expand Up @@ -751,7 +756,7 @@ def __init__(
if index is None:
self._index = RangeIndex(0)
else:
self._index = cudf.Index(index)
self._index = ensure_index(index)
if columns is not None:
rangeindex = isinstance(
columns, (range, pd.RangeIndex, cudf.RangeIndex)
Expand Down Expand Up @@ -909,7 +914,7 @@ def _init_from_series_list(self, data, columns, index):
f"not match length of index ({index_length})"
)

final_index = cudf.Index(index)
final_index = ensure_index(index)

series_lengths = list(map(len, data))
data = numeric_normalize_types(*data)
Expand Down Expand Up @@ -977,9 +982,9 @@ def _init_from_list_like(self, data, index=None, columns=None):
if index is None:
index = RangeIndex(start=0, stop=len(data))
else:
index = cudf.Index(index)
index = ensure_index(index)

self._index = cudf.Index(index)
self._index = index
# list-of-dicts case
if len(data) > 0 and isinstance(data[0], dict):
data = DataFrame.from_pandas(pd.DataFrame(data))
Expand Down Expand Up @@ -1085,7 +1090,7 @@ def _init_from_dict_like(

self._index = RangeIndex(0, num_rows)
else:
self._index = cudf.Index(index)
self._index = ensure_index(index)

if len(data):
self._data.multiindex = True
Expand Down Expand Up @@ -1491,7 +1496,7 @@ def memory_usage(self, index=True, deep=False):
names.append("Index")
return Series._from_data(
data={None: as_column(mem_usage)},
index=as_index(names),
index=cudf.Index(names),
)

@_performance_tracking
Expand Down Expand Up @@ -4033,7 +4038,7 @@ def transpose(self):
# Set the old column names as the new index
result = self.__class__._from_data(
ColumnAccessor(dict(enumerate(result_columns)), verify=False),
index=as_index(index),
index=cudf.Index(index),
)
# Set the old index as the new column names
result.columns = columns
Expand Down Expand Up @@ -5657,7 +5662,7 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False):
}

if not is_scalar(index):
new_index = cudf.Index(index)
new_index = ensure_index(index)
else:
new_index = None

Expand Down Expand Up @@ -5741,7 +5746,7 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False):
}

if index is not None:
index = cudf.Index(index)
index = ensure_index(index)

if isinstance(columns, (pd.Index, cudf.Index)):
level_names = tuple(columns.names)
Expand Down
13 changes: 12 additions & 1 deletion python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,17 @@
from collections.abc import Generator, Iterable


def ensure_index(index_like: Any) -> BaseIndex:
    """
    Return ``index_like`` as an index object.

    An argument that is already a ``BaseIndex`` instance is handed back
    unchanged (the very same object), which avoids the shallow copy that
    calling ``cudf.Index(...)`` on it would make. Any other argument is
    passed to ``cudf.Index(...)`` and the result is returned.
    """
    if isinstance(index_like, BaseIndex):
        # Already an index: reuse the caller's object instead of rebuilding.
        return index_like
    return cudf.Index(index_like)


class IndexMeta(type):
"""Custom metaclass for Index that overrides instance/subclass tests."""

Expand Down Expand Up @@ -1569,7 +1580,7 @@ def append(self, other):
to_concat.append(obj)
else:
this = self
other = cudf.Index(other)
other = ensure_index(other)

if len(this) == 0 or len(other) == 0:
# we'll filter out empties later in ._concat
Expand Down
11 changes: 5 additions & 6 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@
is_list_like,
is_scalar,
)
from cudf.core._base_index import BaseIndex
from cudf.core._compat import PANDAS_LT_300
from cudf.core.buffer import acquire_spill_lock
from cudf.core.column import ColumnBase, as_column
Expand All @@ -42,7 +41,7 @@
from cudf.core.dtypes import ListDtype
from cudf.core.frame import Frame
from cudf.core.groupby.groupby import GroupBy
from cudf.core.index import Index, RangeIndex, _index_from_data
from cudf.core.index import RangeIndex, _index_from_data, ensure_index
from cudf.core.missing import NA
from cudf.core.multiindex import MultiIndex
from cudf.core.resample import _Resampler
Expand All @@ -66,6 +65,8 @@
Dtype,
NotImplementedType,
)
from cudf.core._base_index import BaseIndex


doc_reset_index_template = """
Reset the index of the {klass}, or a level of it.
Expand Down Expand Up @@ -627,9 +628,7 @@ def index(self, value):
f"new values have {len(value)} elements"
)
# avoid unnecessary cast to Index
if not isinstance(value, BaseIndex):
value = Index(value)

value = ensure_index(value)
self._index = value

@_performance_tracking
Expand Down Expand Up @@ -3595,7 +3594,7 @@ def _align_to_index(
sort: bool = True,
allow_non_unique: bool = False,
) -> Self:
index = cudf.Index(index)
index = ensure_index(index)

if self.index.equals(index):
return self
Expand Down
3 changes: 2 additions & 1 deletion python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
BaseIndex,
_get_indexer_basic,
_lexsorted_equal_range,
ensure_index,
)
from cudf.core.join._join_helpers import _match_join_keys
from cudf.utils.dtypes import is_column_like
Expand Down Expand Up @@ -173,7 +174,7 @@ def __init__(
"codes and is inconsistent!"
)

levels = [cudf.Index(level) for level in levels]
levels = [ensure_index(level) for level in levels]

if len(levels) != len(codes._data):
raise ValueError(
Expand Down
12 changes: 5 additions & 7 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
from cudf.core.column.struct import StructMethods
from cudf.core.column_accessor import ColumnAccessor
from cudf.core.groupby.groupby import SeriesGroupBy, groupby_doc_template
from cudf.core.index import BaseIndex, DatetimeIndex, RangeIndex, as_index
from cudf.core.index import BaseIndex, DatetimeIndex, RangeIndex, ensure_index
from cudf.core.indexed_frame import (
IndexedFrame,
_FrameIndexer,
Expand Down Expand Up @@ -588,10 +588,8 @@ def __init__(
data = data.copy(deep=True)
name_from_data = data.name
column = as_column(data, nan_as_null=nan_as_null, dtype=dtype)
if isinstance(data, pd.Series):
index_from_data = cudf.Index(data.index)
elif isinstance(data, Series):
index_from_data = data.index
if isinstance(data, (pd.Series, Series)):
index_from_data = ensure_index(data.index)
elif isinstance(data, ColumnAccessor):
raise TypeError(
"Use cudf.Series._from_data for constructing a Series from "
Expand Down Expand Up @@ -642,7 +640,7 @@ def __init__(
name = name_from_data

if index is not None:
index = cudf.Index(index)
index = ensure_index(index)

if index_from_data is not None:
first_index = index_from_data
Expand Down Expand Up @@ -3191,7 +3189,7 @@ def quantile(

return Series._from_data(
data={self.name: result},
index=as_index(np_array_q) if quant_index else None,
index=cudf.Index(np_array_q) if quant_index else None,
)

@docutils.doc_describe()
Expand Down
24 changes: 24 additions & 0 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -11078,3 +11078,27 @@ def test_dataframe_loc_int_float(dtype1, dtype2):
expected = pdf.loc[pidx]

assert_eq(actual, expected, check_index_type=True, check_dtype=True)


@pytest.mark.parametrize(
    "data",
    [
        cudf.DataFrame(range(2)),
        None,
        [cudf.Series(range(2))],
        [[0], [1]],
        {1: range(2)},
        cupy.arange(2),
    ],
)
def test_init_with_index_no_shallow_copy(data):
    """DataFrame(data, index=...) must expose the same index object."""
    expected_index = cudf.RangeIndex(2)
    frame = cudf.DataFrame(data, index=expected_index)
    # Identity, not equality: the constructor must not copy the index.
    assert frame.index is expected_index


def test_from_records_with_index_no_shallow_copy():
    """A DataFrame built from a recarray must keep the given index object."""
    expected_index = cudf.RangeIndex(2)
    record_dtype = [("x", "<f8"), ("y", "<i8")]
    structured = np.array([(1.0, 2), (3.0, 4)], dtype=record_dtype)
    records = structured.view(np.recarray)
    frame = cudf.DataFrame(records, index=expected_index)
    # Identity, not equality: the constructor must not copy the index.
    assert frame.index is expected_index

0 comments on commit 2b547dc

Please sign in to comment.