From 8a397815871dff27601e46b7bc3542a78e6d8600 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 30 Jul 2024 10:44:28 -0700 Subject: [PATCH 1/8] Disallow column in the Series constructor --- python/cudf/cudf/core/column/methods.py | 16 ++++++------- python/cudf/cudf/core/column/string.py | 25 ++++++++------------ python/cudf/cudf/core/series.py | 12 ++++++---- python/cudf/cudf/core/tools/datetimes.py | 13 ++++++++--- python/cudf/cudf/core/tools/numeric.py | 29 +++++++++--------------- python/cudf/cudf/tests/test_string.py | 2 +- 6 files changed, 48 insertions(+), 49 deletions(-) diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index 7c6f4e05577..5abaef207a4 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -7,6 +7,8 @@ from typing_extensions import Literal import cudf +import cudf.core.column +import cudf.core.column_accessor from cudf.utils.utils import NotIterable ParentType = Union["cudf.Series", "cudf.core.index.Index"] @@ -84,14 +86,12 @@ def _return_or_inplace( data=table, index=self._parent.index ) elif isinstance(self._parent, cudf.Series): - if retain_index: - return cudf.Series( - new_col, - name=self._parent.name, - index=self._parent.index, - ) - else: - return cudf.Series(new_col, name=self._parent.name) + ca = cudf.core.column_accessor.ColumnAccessor( + {self._parent.name: new_col}, verify=False + ) + return cudf.Series._from_data( + ca, self._parent.index if retain_index else None + ) elif isinstance(self._parent, cudf.BaseIndex): return cudf.Index(new_col, name=self._parent.name) else: diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index ec95c50f455..bf4497dc388 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -358,7 +358,7 @@ def cat(self, others=None, sep=None, na_rep=None): ) if len(data) == 1 and 
data.null_count == 1: - data = [""] + data = cudf.core.column.as_column("", length=len(data)) # We only want to keep the index if we are adding something to each # row, not if we are joining all the rows into a single string. out = self._return_or_inplace(data, retain_index=others is not None) @@ -3623,7 +3623,7 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: data = libstrings.findall(self._column, pat, flags) return self._return_or_inplace(data) - def find_multiple(self, patterns: SeriesOrIndex) -> "cudf.Series": + def find_multiple(self, patterns: SeriesOrIndex) -> cudf.Series: """ Find all first occurrences of patterns in the Series/Index. @@ -3679,13 +3679,13 @@ def find_multiple(self, patterns: SeriesOrIndex) -> "cudf.Series": f"got: {patterns_column.dtype}" ) - return cudf.Series( + result = self._return_or_inplace( libstrings.find_multiple(self._column, patterns_column), - index=self._parent.index - if isinstance(self._parent, cudf.Series) - else self._parent, - name=self._parent.name, + retain_index=True, ) + if isinstance(result, cudf.Index): + result = cudf.Series(result, index=self._parent) + return cast(cudf.Series, result) def isempty(self) -> SeriesOrIndex: """ @@ -4376,14 +4376,9 @@ def code_points(self) -> SeriesOrIndex: 2 99 dtype: int32 """ - - new_col = libstrings.code_points(self._column) - if isinstance(self._parent, cudf.Series): - return cudf.Series(new_col, name=self._parent.name) - elif isinstance(self._parent, cudf.BaseIndex): - return cudf.Index(new_col, name=self._parent.name) - else: - return new_col + return self._return_or_inplace( + libstrings.code_points(self._column), retain_index=False + ) def translate(self, table: dict) -> SeriesOrIndex: """ diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 10ac1fdfc1e..aa3671c84b0 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -586,10 +586,10 @@ def __init__( column = as_column(data, 
nan_as_null=nan_as_null, dtype=dtype) if isinstance(data, (pd.Series, Series)): index_from_data = ensure_index(data.index) - elif isinstance(data, ColumnAccessor): + elif isinstance(data, (ColumnAccessor, ColumnBase)): raise TypeError( "Use cudf.Series._from_data for constructing a Series from " - "ColumnAccessor" + "ColumnAccessor or a ColumnBase" ) elif isinstance(data, dict): if not data: @@ -1597,7 +1597,9 @@ def _concat(cls, objs, axis=0, index=True): if len(objs): col = col._with_type_metadata(objs[0].dtype) - return cls(data=col, index=index, name=name) + return cls._from_data( + ColumnAccessor({name: col}, verify=False), index=index + ) @property # type: ignore @_performance_tracking @@ -3036,7 +3038,9 @@ def unique(self): res = self._column.unique() if cudf.get_option("mode.pandas_compatible"): return res.values - return Series(res, name=self.name) + return Series._from_data( + self._data._from_columns_like_self([res], verify=False) + ) @_performance_tracking def value_counts( diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index c6e2b5d10e1..2247837bbcb 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -18,6 +18,8 @@ ) from cudf.api.types import is_integer, is_scalar from cudf.core import column +from cudf.core.column_accessor import ColumnAccessor +from cudf.core.index import ensure_index # https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/core/tools/datetimes.py#L1112 _unit_map = { @@ -275,7 +277,8 @@ def to_datetime( format=format, utc=utc, ) - return cudf.Series(col, index=arg.index) + ca = ColumnAccessor({None: col}, verify=False) + return cudf.Series._from_data(ca, index=arg.index) else: col = _process_col( col=column.as_column(arg), @@ -286,9 +289,13 @@ def to_datetime( utc=utc, ) if isinstance(arg, (cudf.BaseIndex, pd.Index)): - return cudf.Index(col, name=arg.name) + ca = ColumnAccessor({arg.name: col}, verify=False) + return 
cudf.DatetimeIndex._from_data(ca) elif isinstance(arg, (cudf.Series, pd.Series)): - return cudf.Series(col, index=arg.index, name=arg.name) + ca = ColumnAccessor({arg.name: col}, verify=False) + return cudf.Series._from_data( + ca, index=ensure_index(arg.index) + ) elif is_scalar(arg): return col.element_indexing(0) else: diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 07158e4ee61..ef6955be643 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -1,6 +1,8 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. +from __future__ import annotations import warnings +from typing import TYPE_CHECKING import numpy as np import pandas as pd @@ -10,9 +12,14 @@ from cudf._lib import strings as libstrings from cudf.api.types import _is_non_decimal_numeric_dtype, is_string_dtype from cudf.core.column import as_column +from cudf.core.column_accessor import ColumnAccessor from cudf.core.dtypes import CategoricalDtype +from cudf.core.index import ensure_index from cudf.utils.dtypes import can_convert_to_column +if TYPE_CHECKING: + from cudf.core.column import ColumnBase + def to_numeric(arg, errors="raise", downcast=None): """ @@ -164,7 +171,8 @@ def to_numeric(arg, errors="raise", downcast=None): break if isinstance(arg, (cudf.Series, pd.Series)): - return cudf.Series(col, index=arg.index, name=arg.name) + ca = ColumnAccessor({arg.name: col}, verify=False) + return cudf.Series._from_data(ca, index=ensure_index(arg.index)) else: if col.has_nulls(): # To match pandas, always return a floating type filled with nan. 
@@ -226,25 +234,10 @@ def _convert_str_col(col, errors, _downcast=None): raise ValueError("Unable to convert some strings to numerics.") -def _proc_inf_empty_strings(col): +def _proc_inf_empty_strings(col: ColumnBase) -> ColumnBase: """Handles empty and infinity strings""" col = libstrings.to_lower(col) - col = _proc_empty_strings(col) - col = _proc_inf_strings(col) - return col - - -def _proc_empty_strings(col): - """Replaces empty strings with NaN""" - s = cudf.Series(col) - s = s.where(s != "", "NaN") - return s._column - - -def _proc_inf_strings(col): - """Convert "inf/infinity" strings into "Inf", the native string - representing infinity in libcudf - """ + col = col.find_and_replace(as_column([""]), as_column(["NaN"])) # TODO: This can be handled by libcudf in # future see StringColumn.as_numerical_column col = libstrings.replace_multi( diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index f447759d010..1acc0bc2041 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -2677,7 +2677,7 @@ def test_string_int_to_ipv4(): ["0.0.0.0", None, "0.0.0.0", "41.168.0.1", "127.0.0.1", "41.197.0.1"] ) - got = cudf.Series(gsr._column.int2ip()) + got = cudf.Series._from_data({None: gsr._column.int2ip()}) assert_eq(expected, got) From ece3073bd9a77030c490b632c2c29544e0b0e8f5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 30 Jul 2024 16:11:43 -0700 Subject: [PATCH 2/8] More places where Series was used --- python/cudf/cudf/core/byte_pair_encoding.py | 6 +- python/cudf/cudf/core/column/categorical.py | 6 +- python/cudf/cudf/core/column/numerical.py | 12 ++-- python/cudf/cudf/core/column/string.py | 18 +++-- python/cudf/cudf/core/dataframe.py | 29 ++++---- python/cudf/cudf/core/groupby/groupby.py | 11 +-- python/cudf/cudf/core/index.py | 30 ++++++-- python/cudf/cudf/core/indexed_frame.py | 12 ++-- python/cudf/cudf/core/multiindex.py | 12 
+++- python/cudf/cudf/core/reshape.py | 8 +-- python/cudf/cudf/core/series.py | 76 ++++++++++++++++---- python/cudf/cudf/core/single_column_frame.py | 29 +------- python/cudf/cudf/core/tokenize_vocabulary.py | 8 ++- python/cudf/cudf/tests/test_apply_rows.py | 8 ++- python/cudf/cudf/tests/test_column.py | 6 +- python/cudf/cudf/tests/test_dataframe.py | 34 +++++---- python/cudf/cudf/tests/test_pickling.py | 8 ++- python/cudf/cudf/tests/test_replace.py | 6 +- python/cudf/cudf/tests/test_series.py | 10 ++- python/cudf/cudf/tests/test_setitem.py | 10 ++- python/cudf/cudf/tests/test_string_udfs.py | 4 +- 21 files changed, 212 insertions(+), 131 deletions(-) diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py index 4c881022ecf..da9366e5b39 100644 --- a/python/cudf/cudf/core/byte_pair_encoding.py +++ b/python/cudf/cudf/core/byte_pair_encoding.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
from __future__ import annotations @@ -27,7 +27,7 @@ class BytePairEncoder: def __init__(self, merges_pair: "cudf.Series"): self.merge_pairs = cpp_merge_pairs(merges_pair._column) - def __call__(self, text, separator: str = " "): + def __call__(self, text, separator: str = " ") -> cudf.Series: """ Parameters @@ -56,4 +56,4 @@ def __call__(self, text, separator: str = " "): sep = cudf.Scalar(separator, dtype="str") result = cpp_byte_pair_encoding(text._column, self.merge_pairs, sep) - return cudf.Series(result) + return cudf.Series._from_data({None: result}) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 9433a91b9c6..f8088770be2 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -123,7 +123,7 @@ def categories(self) -> "cudf.core.index.Index": return self._column.dtype.categories @property - def codes(self) -> "cudf.Series": + def codes(self) -> cudf.Series: """ Return Series of codes as well as the index. 
""" @@ -132,7 +132,7 @@ def codes(self) -> "cudf.Series": if isinstance(self._parent, cudf.Series) else None ) - return cudf.Series(self._column.codes, index=index) + return cudf.Series._from_data({None: self._column.codes}, index=index) @property def ordered(self) -> bool: @@ -914,7 +914,7 @@ def find_and_replace( ) cur_categories = replaced.categories new_categories = cur_categories.apply_boolean_mask( - ~cudf.Series(cur_categories.isin(drop_values)) + cur_categories.isin(drop_values).unary_operator("not") ) replaced = replaced._set_categories(new_categories) df = df.dropna(subset=["new"]) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index f9404eb3b40..91f865fb39b 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -567,11 +567,8 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: if self.dtype.kind == "f": # Exclude 'np.inf', '-np.inf' - s = cudf.Series(self) - # TODO: replace np.inf with cudf scalar when - # https://github.com/rapidsai/cudf/pull/6297 merges - non_infs = s[~((s == np.inf) | (s == -np.inf))] - col = non_infs._column + not_inf = (self != np.inf) & (self != -np.inf) + col = self.apply_boolean_mask(not_inf) else: col = self @@ -611,8 +608,7 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: else: filled = self.fillna(0) return ( - cudf.Series(filled).astype(to_dtype).astype(filled.dtype) - == cudf.Series(filled) + filled.astype(to_dtype).astype(filled.dtype) == filled ).all() # want to cast float to int: @@ -627,7 +623,7 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: # NOTE(seberg): it would make sense to limit to the mantissa range. 
if (float(self.min()) >= min_) and (float(self.max()) <= max_): filled = self.fillna(0) - return (cudf.Series(filled) % 1 == 0).all() + return (filled % 1 == 0).all() else: return False diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index bf4497dc388..9f1313e1569 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -3679,13 +3679,16 @@ def find_multiple(self, patterns: SeriesOrIndex) -> cudf.Series: f"got: {patterns_column.dtype}" ) - result = self._return_or_inplace( - libstrings.find_multiple(self._column, patterns_column), - retain_index=True, + return cudf.Series._from_data( + { + self._parent.name: libstrings.find_multiple( + self._column, patterns_column + ) + }, + index=self._parent.index + if isinstance(self._parent, cudf.Series) + else self._parent, ) - if isinstance(result, cudf.Index): - result = cudf.Series(result, index=self._parent) - return cast(cudf.Series, result) def isempty(self) -> SeriesOrIndex: """ @@ -4689,7 +4692,8 @@ def character_tokenize(self) -> SeriesOrIndex: if isinstance(self._parent, cudf.Series): lengths = self.len().fillna(0) index = self._parent.index.repeat(lengths) - return cudf.Series(result_col, name=self._parent.name, index=index) + data = {self._parent.name: result_col} + return cudf.Series._from_data(data, index=index) elif isinstance(self._parent, cudf.BaseIndex): return cudf.Index(result_col, name=self._parent.name) else: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 6ea11fe9f64..211e7520629 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -508,8 +508,8 @@ def __getitem__(self, arg): new_name = result.index[0] result = Series._concat( [result[name] for name in column_names], - index=result.keys(), ) + result.index = cudf.Index(result.keys()) result.name = new_name return result except TypeError: @@ -1753,7 +1753,7 @@ def _concat( if 1 == 
first_data_column_position: table_index = cudf.Index(cols[0]) elif first_data_column_position > 1: - table_index = DataFrame._from_data( + table_index = cudf.MultiIndex._from_data( data=dict( zip( indices[:first_data_column_position], @@ -3804,7 +3804,8 @@ def agg(self, aggs, axis=None): col_empty = column_empty( len(idxs), dtype=col.dtype, masked=True ) - ans = cudf.Series(data=col_empty, index=idxs) + ca = ColumnAccessor({None: col_empty}, verify=False) + ans = cudf.Series._from_data(ca, index=cudf.Index(idxs)) if isinstance(aggs.get(key), abc.Iterable): # TODO : Allow simultaneous pass for multi-aggregation # as a future optimization @@ -6073,9 +6074,8 @@ def quantile( if q_is_number: result = result.transpose() - return Series( - data=result._columns[0], index=result.index, name=q - ) + ca = ColumnAccessor({q: result._columns[0]}, verify=False) + return Series._from_data(ca, index=result.index) else: # Ensure that qs is non-scalar so that we always get a column back. interpolation = interpolation or "linear" @@ -6700,11 +6700,8 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): result = result.set_mask( cudf._lib.transform.bools_to_mask(mask._column) ) - return Series( - result, - index=self.index, - dtype=result_dtype, - ) + ca = ColumnAccessor({None: result}, verify=False) + return Series._from_data(ca, index=self.index) else: result_df = DataFrame(result).set_index(self.index) result_df._set_columns_like(prepared._data) @@ -8474,7 +8471,9 @@ def _get_non_null_cols_and_dtypes(col_idxs, list_of_columns): return non_null_columns, dtypes -def _find_common_dtypes_and_categories(non_null_columns, dtypes): +def _find_common_dtypes_and_categories( + non_null_columns, dtypes +) -> dict[Any, ColumnBase]: # A mapping of {idx: categories}, where `categories` is a # column of all the unique categorical values from each # categorical column across all input frames @@ -8490,9 +8489,9 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes): 
isinstance(col, cudf.core.column.CategoricalColumn) for col in cols ): # Combine and de-dupe the categories - categories[idx] = cudf.Series( - concat_columns([col.categories for col in cols]) - )._column.unique() + categories[idx] = concat_columns( + [col.categories for col in cols] + ).unique() # Set the column dtype to the codes' dtype. The categories # will be re-assigned at the end dtypes[idx] = min_signed_type(len(categories[idx])) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 3cfbd1d736a..f1b31b8335c 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -458,12 +458,13 @@ def size(self): """ Return the size of each group. """ - return ( - cudf.Series( - cudf.core.column.column_empty( - len(self.obj), "int8", masked=False - ) + data = { + None: cudf.core.column.column_empty( + len(self.obj), "int8", masked=False ) + } + return ( + cudf.Series._from_data(data) .groupby(self.grouping, sort=self._sort, dropna=self._dropna) .agg("size") ) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 8c3b091abec..404c100ab76 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1091,6 +1091,28 @@ def _from_data_like_self( @classmethod @_performance_tracking def from_arrow(cls, obj): + """Create from PyArrow Array/ChunkedArray. + + Parameters + ---------- + obj : PyArrow Array/ChunkedArray + PyArrow Object which has to be converted. + + Raises + ------ + TypeError for invalid input type. 
+ + Returns + ------- + SingleColumnFrame + + Examples + -------- + >>> import cudf + >>> import pyarrow as pa + >>> cudf.Index.from_arrow(pa.array(["a", "b", None])) + Index(['a', 'b', None], dtype='object') + """ try: return cls(ColumnBase.from_arrow(obj)) except TypeError: @@ -1296,22 +1318,22 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): return _return_get_indexer_result(result.values) scatter_map, indices = libcudf.join.join([lcol], [rcol], how="inner") - (result,) = libcudf.copying.scatter([indices], scatter_map, [result]) - result_series = cudf.Series(result) + result = libcudf.copying.scatter([indices], scatter_map, [result])[0] + result_series = cudf.Series._from_data({None: result}) if method in {"ffill", "bfill", "pad", "backfill"}: result_series = _get_indexer_basic( index=self, positions=result_series, method=method, - target_col=cudf.Series(needle), + target_col=cudf.Series._from_data({None: needle}), tolerance=tolerance, ) elif method == "nearest": result_series = _get_nearest_indexer( index=self, positions=result_series, - target_col=cudf.Series(needle), + target_col=cudf.Series._from_data({None: needle}), tolerance=tolerance, ) elif method is not None: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 0678ebfdd81..4cd847d38b8 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -35,6 +35,7 @@ is_list_like, is_scalar, ) +from cudf.core._base_index import BaseIndex from cudf.core._compat import PANDAS_LT_300 from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ColumnBase, as_column @@ -67,7 +68,6 @@ Dtype, NotImplementedType, ) - from cudf.core._base_index import BaseIndex doc_reset_index_template = """ @@ -304,6 +304,10 @@ def _from_data( index: BaseIndex | None = None, ): out = super()._from_data(data) + if not (index is None or isinstance(index, BaseIndex)): + raise ValueError( + f"index must be None 
or a cudf.Index not {type(index).__name__}" + ) out._index = RangeIndex(out._data.nrows) if index is None else index return out @@ -3219,13 +3223,13 @@ def duplicated(self, subset=None, keep="first"): distinct = libcudf.stream_compaction.distinct_indices( columns, keep=keep ) - (result,) = libcudf.copying.scatter( + result = libcudf.copying.scatter( [cudf.Scalar(False, dtype=bool)], distinct, [as_column(True, length=len(self), dtype=bool)], bounds_check=False, - ) - return cudf.Series(result, index=self.index) + )[0] + return cudf.Series._from_data({None: result}, index=self.index) @_performance_tracking def _empty_like(self, keep_index=True) -> Self: diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 2788455aebf..c5b85a09f2e 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -786,7 +786,11 @@ def _index_and_downcast(self, result, index, index_key): out_index.insert( out_index._num_columns, k, - cudf.Series._from_data({None: index._data.columns[k]}), + cudf.Series._from_data( + ColumnAccessor( + {None: index._data.columns[k]}, verify=False + ) + ), ) # determine if we should downcast from a DataFrame to a Series @@ -1925,8 +1929,10 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): *join_keys, how="inner", ) - (result,) = libcudf.copying.scatter([indices], scatter_map, [result]) - result_series = cudf.Series(result) + result = libcudf.copying.scatter([indices], scatter_map, [result])[0] + result_series = cudf.Series._from_data( + ColumnAccessor({None: result}, verify=False) + ) if method in {"ffill", "bfill", "pad", "backfill"}: result_series = _get_indexer_basic( diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index e7248977b1d..52a55760d4a 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -484,9 +484,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): if 
len(new_objs) == 1 and not ignore_index: return new_objs[0] else: - return cudf.Series._concat( - objs, axis=axis, index=None if ignore_index else True - ) + return cudf.Series._concat(objs, axis=axis, index=not ignore_index) elif typ is cudf.MultiIndex: return cudf.MultiIndex._concat(objs) elif issubclass(typ, cudf.Index): @@ -632,7 +630,7 @@ def melt( def _tile(A, reps): series_list = [A] * reps if reps > 0: - return cudf.Series._concat(objs=series_list, index=None) + return cudf.Series._concat(objs=series_list, index=False) else: return cudf.Series([], dtype=A.dtype) @@ -661,7 +659,7 @@ def _tile(A, reps): # Step 3: add values mdata[value_name] = cudf.Series._concat( - objs=[frame[val] for val in value_vars], index=None + objs=[frame[val] for val in value_vars], index=False ) return cudf.DataFrame(mdata) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index aa3671c84b0..333cb667603 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -69,6 +69,8 @@ from cudf.utils.performance_tracking import _performance_tracking if TYPE_CHECKING: + import pyarrow as pa + from cudf._typing import ( ColumnLike, DataFrameOrSeries, @@ -294,8 +296,8 @@ def __getitem__(self, arg: Any) -> ScalarLike | DataFrameOrSeries: return result try: arg = self._loc_to_iloc(arg) - except (TypeError, KeyError, IndexError, ValueError): - raise KeyError(arg) + except (TypeError, KeyError, IndexError, ValueError) as err: + raise KeyError(arg) from err return self._frame.iloc[arg] @@ -394,8 +396,11 @@ def _loc_to_iloc(self, arg): return _indices_from_labels(self._frame, arg) else: - arg = cudf.core.series.Series(cudf.core.column.as_column(arg)) - if arg.dtype in (bool, np.bool_): + ca = ColumnAccessor( + {None: cudf.core.column.as_column(arg)}, verify=False + ) + arg = cudf.core.series.Series._from_data(ca) + if arg.dtype.kind == "b": return arg else: indices = _indices_from_labels(self._frame, arg) @@ -510,7 +515,39 @@ def 
from_categorical(cls, categorical, codes=None): col = cudf.core.column.categorical.pandas_categorical_as_column( categorical, codes=codes ) - return Series(data=col) + ca = ColumnAccessor({None: col}, verify=False) + return Series._from_data(ca) + + @classmethod + @_performance_tracking + def from_arrow(cls, array: pa.Array): + """Create from PyArrow Array/ChunkedArray. + + Parameters + ---------- + array : PyArrow Array/ChunkedArray + PyArrow Object which has to be converted. + + Raises + ------ + TypeError for invalid input type. + + Returns + ------- + SingleColumnFrame + + Examples + -------- + >>> import cudf + >>> import pyarrow as pa + >>> cudf.Series.from_arrow(pa.array(["a", "b", None])) + 0 a + 1 b + 2 + dtype: object + """ + ca = ColumnAccessor({None: ColumnBase.from_arrow(array)}, verify=False) + return cls._from_data(ca) @classmethod @_performance_tracking @@ -560,7 +597,8 @@ def from_masked_array(cls, data, mask, null_count=None): dtype: int64 """ col = as_column(data).set_mask(mask) - return cls(data=col) + ca = ColumnAccessor({None: col}, verify=False) + return cls._from_data(ca) @_performance_tracking def __init__( @@ -1535,17 +1573,21 @@ def dtype(self): @classmethod @_performance_tracking - def _concat(cls, objs, axis=0, index=True): + def _concat(cls, objs, axis=0, index: bool = True): # Concatenate index if not provided if index is True: if isinstance(objs[0].index, cudf.MultiIndex): - index = cudf.MultiIndex._concat([o.index for o in objs]) + result_index = cudf.MultiIndex._concat([o.index for o in objs]) else: with warnings.catch_warnings(): warnings.simplefilter("ignore", FutureWarning) - index = cudf.core.index.Index._concat( + result_index = cudf.core.index.Index._concat( [o.index for o in objs] ) + elif index is False: + result_index = None + else: + raise ValueError(f"{index=} must be a bool") names = {obj.name for obj in objs} if len(names) == 1: @@ -1598,7 +1640,7 @@ def _concat(cls, objs, axis=0, index=True): col = 
col._with_type_metadata(objs[0].dtype) return cls._from_data( - ColumnAccessor({name: col}, verify=False), index=index + ColumnAccessor({name: col}, verify=False), index=result_index ) @property # type: ignore @@ -3355,9 +3397,15 @@ def digitize(self, bins, right=False): 3 2 dtype: int32 """ - return Series( - cudf.core.column.numerical.digitize(self._column, bins, right) + ca = ColumnAccessor( + { + self.name: cudf.core.column.numerical.digitize( + self._column, bins, right + ) + }, + verify=False, ) + return Series._from_data(ca) @_performance_tracking def diff(self, periods=1): @@ -5318,10 +5366,10 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): elif b_col.null_count: null_values = b_col.isnull() else: - return Series(result_col, index=index) + return Series._from_data({None: result_col}, index=index) result_col[null_values] = False if equal_nan is True and a_col.null_count and b_col.null_count: result_col[equal_nulls] = True - return Series(result_col, index=index) + return Series._from_data({None: result_col}, index=index) diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index a5ff1223791..84a459ae585 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -113,34 +113,7 @@ def values_host(self) -> numpy.ndarray: # noqa: D102 @classmethod @_performance_tracking def from_arrow(cls, array) -> Self: - """Create from PyArrow Array/ChunkedArray. - - Parameters - ---------- - array : PyArrow Array/ChunkedArray - PyArrow Object which has to be converted. - - Raises - ------ - TypeError for invalid input type. 
- - Returns - ------- - SingleColumnFrame - - Examples - -------- - >>> import cudf - >>> import pyarrow as pa - >>> cudf.Index.from_arrow(pa.array(["a", "b", None])) - Index(['a', 'b', None], dtype='object') - >>> cudf.Series.from_arrow(pa.array(["a", "b", None])) - 0 a - 1 b - 2 - dtype: object - """ - return cls(ColumnBase.from_arrow(array)) + raise NotImplementedError @_performance_tracking def to_arrow(self) -> pa.Array: diff --git a/python/cudf/cudf/core/tokenize_vocabulary.py b/python/cudf/cudf/core/tokenize_vocabulary.py index afb3496311b..5a45fc80d1d 100644 --- a/python/cudf/cudf/core/tokenize_vocabulary.py +++ b/python/cudf/cudf/core/tokenize_vocabulary.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -22,7 +22,9 @@ class TokenizeVocabulary: def __init__(self, vocabulary: "cudf.Series"): self.vocabulary = cpp_tokenize_vocabulary(vocabulary._column) - def tokenize(self, text, delimiter: str = "", default_id: int = -1): + def tokenize( + self, text, delimiter: str = "", default_id: int = -1 + ) -> cudf.Series: """ Parameters ---------- @@ -45,4 +47,4 @@ def tokenize(self, text, delimiter: str = "", default_id: int = -1): text._column, self.vocabulary, delim, default_id ) - return cudf.Series(result) + return cudf.Series._from_data({None: result}) diff --git a/python/cudf/cudf/tests/test_apply_rows.py b/python/cudf/cudf/tests/test_apply_rows.py index a11022c1a17..8e7c6975201 100644 --- a/python/cudf/cudf/tests/test_apply_rows.py +++ b/python/cudf/cudf/tests/test_apply_rows.py @@ -27,8 +27,12 @@ def test_dataframe_apply_rows(dtype, has_nulls, pessimistic): gdf_series_expected = gdf_series_a * gdf_series_b else: # optimistically ignore the null masks - a = cudf.Series(column.build_column(gdf_series_a.data, dtype)) - b = cudf.Series(column.build_column(gdf_series_b.data, dtype)) + a = cudf.Series._from_data( + {None: column.build_column(gdf_series_a.data, dtype)} 
+ ) + b = cudf.Series._from_data( + {None: column.build_column(gdf_series_b.data, dtype)} + ) gdf_series_expected = a * b df_expected = cudf.DataFrame( diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index c288155112c..3a91c116dec 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -95,7 +95,7 @@ def test_column_offset_and_size(pandas_input, offset, size): else: assert col.size == (col.data.size / col.dtype.itemsize) - got = cudf.Series(col) + got = cudf.Series._from_data({None: col}) if offset is None: offset = 0 @@ -112,8 +112,8 @@ def test_column_offset_and_size(pandas_input, offset, size): def column_slicing_test(col, offset, size, cast_to_float=False): col_slice = col.slice(offset, offset + size) - series = cudf.Series(col) - sliced_series = cudf.Series(col_slice) + series = cudf.Series._from_data({None: col}) + sliced_series = cudf.Series._from_data({None: col_slice}) if cast_to_float: pd_series = series.astype(float).to_pandas() diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index e2ce5c03b70..f73b815a2b5 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -4264,34 +4264,36 @@ def test_empty_dataframe_describe(): def test_as_column_types(): col = column.as_column(cudf.Series([], dtype="float64")) assert_eq(col.dtype, np.dtype("float64")) - gds = cudf.Series(col) + gds = cudf.Series._from_data({None: col}) pds = pd.Series(pd.Series([], dtype="float64")) assert_eq(pds, gds) col = column.as_column(cudf.Series([], dtype="float64"), dtype="float32") assert_eq(col.dtype, np.dtype("float32")) - gds = cudf.Series(col) + gds = cudf.Series._from_data({None: col}) pds = pd.Series(pd.Series([], dtype="float32")) assert_eq(pds, gds) col = column.as_column(cudf.Series([], dtype="float64"), dtype="str") assert_eq(col.dtype, np.dtype("object")) - gds = cudf.Series(col) + gds = 
cudf.Series._from_data({None: col}) pds = pd.Series(pd.Series([], dtype="str")) assert_eq(pds, gds) col = column.as_column(cudf.Series([], dtype="float64"), dtype="object") assert_eq(col.dtype, np.dtype("object")) - gds = cudf.Series(col) + gds = cudf.Series._from_data({None: col}) pds = pd.Series(pd.Series([], dtype="object")) assert_eq(pds, gds) pds = pd.Series(np.array([1, 2, 3]), dtype="float32") - gds = cudf.Series(column.as_column(np.array([1, 2, 3]), dtype="float32")) + gds = cudf.Series._from_data( + {None: column.as_column(np.array([1, 2, 3]), dtype="float32")} + ) assert_eq(pds, gds) @@ -4301,24 +4303,30 @@ def test_as_column_types(): assert_eq(pds, gds) pds = pd.Series([], dtype="float64") - gds = cudf.Series(column.as_column(pds)) + gds = cudf.Series._from_data({None: column.as_column(pds)}) assert_eq(pds, gds) pds = pd.Series([1, 2, 4], dtype="int64") - gds = cudf.Series(column.as_column(cudf.Series([1, 2, 4]), dtype="int64")) + gds = cudf.Series._from_data( + {None: column.as_column(cudf.Series([1, 2, 4]), dtype="int64")} + ) assert_eq(pds, gds) pds = pd.Series([1.2, 18.0, 9.0], dtype="float32") - gds = cudf.Series( - column.as_column(cudf.Series([1.2, 18.0, 9.0]), dtype="float32") + gds = cudf.Series._from_data( + { + None: column.as_column( + cudf.Series([1.2, 18.0, 9.0]), dtype="float32" + ) + } ) assert_eq(pds, gds) pds = pd.Series([1.2, 18.0, 9.0], dtype="str") - gds = cudf.Series( - column.as_column(cudf.Series([1.2, 18.0, 9.0]), dtype="str") + gds = cudf.Series._from_data( + {None: column.as_column(cudf.Series([1.2, 18.0, 9.0]), dtype="str")} ) assert_eq(pds, gds) @@ -6521,7 +6529,9 @@ def test_from_pandas_for_series_nan_as_null(nan_as_null): data = [np.nan, 2.0, 3.0] psr = pd.Series(data) - expected = cudf.Series(column.as_column(data, nan_as_null=nan_as_null)) + expected = cudf.Series._from_data( + {None: column.as_column(data, nan_as_null=nan_as_null)} + ) got = cudf.from_pandas(psr, nan_as_null=nan_as_null) assert_eq(expected, got) diff 
--git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py index 719e8a33285..68e2cfe0fb7 100644 --- a/python/cudf/cudf/tests/test_pickling.py +++ b/python/cudf/cudf/tests/test_pickling.py @@ -127,7 +127,9 @@ def test_pickle_categorical_column(slices): pickled = pickle.dumps(input_col) out = pickle.loads(pickled) - assert_eq(Series(out), Series(input_col)) + assert_eq( + Series._from_data({None: out}), Series._from_data({None: input_col}) + ) @pytest.mark.parametrize( @@ -148,4 +150,6 @@ def test_pickle_string_column(slices): pickled = pickle.dumps(input_col) out = pickle.loads(pickled) - assert_eq(Series(out), Series(input_col)) + assert_eq( + Series._from_data({None: out}), Series._from_data({None: input_col}) + ) diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index d4fe5ff3bb5..1973fe6fb41 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -817,12 +817,12 @@ def test_fillna_string(ps_data, fill_value, inplace): def test_series_fillna_invalid_dtype(data_dtype): gdf = cudf.Series([1, 2, None, 3], dtype=data_dtype) fill_value = 2.5 - with pytest.raises(TypeError) as raises: - gdf.fillna(fill_value) - raises.match( + msg = ( f"Cannot safely cast non-equivalent" f" {type(fill_value).__name__} to {gdf.dtype.type.__name__}" ) + with pytest.raises(TypeError, match=msg): + gdf.fillna(fill_value) @pytest.mark.parametrize("data_dtype", NUMERIC_TYPES) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 8ed78d804bf..5075977db0f 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2041,7 +2041,7 @@ def test_series_ordered_dedup(): sr = cudf.Series(np.random.randint(0, 100, 1000)) # pandas unique() preserves order expect = pd.Series(sr.to_pandas().unique()) - got = cudf.Series(sr._column.unique()) + got = cudf.Series._from_data({None: sr._column.unique()}) 
assert_eq(expect.values, got.values) @@ -2697,7 +2697,9 @@ def test_series_duplicate_index_reindex(): def test_list_category_like_maintains_dtype(): dtype = cudf.CategoricalDtype(categories=[1, 2, 3, 4], ordered=True) data = [1, 2, 3] - result = cudf.Series(cudf.core.column.as_column(data, dtype=dtype)) + result = cudf.Series._from_data( + {None: cudf.core.column.as_column(data, dtype=dtype)} + ) expected = pd.Series(data, dtype=dtype.to_pandas()) assert_eq(result, expected) @@ -2705,7 +2707,9 @@ def test_list_category_like_maintains_dtype(): def test_list_interval_like_maintains_dtype(): dtype = cudf.IntervalDtype(subtype=np.int8) data = [pd.Interval(1, 2)] - result = cudf.Series(cudf.core.column.as_column(data, dtype=dtype)) + result = cudf.Series._from_data( + {None: cudf.core.column.as_column(data, dtype=dtype)} + ) expected = pd.Series(data, dtype=dtype.to_pandas()) assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index 69122cdbafa..4c6da6cf1f8 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -178,13 +178,19 @@ def test_column_set_equal_length_object_by_mask(): bool_col = cudf.Series([True, True, True, True, True])._column data[bool_col] = replace_data - assert_eq(cudf.Series(data), cudf.Series(replace_data)) + assert_eq( + cudf.Series._from_data({None: data}), + cudf.Series._from_data({None: replace_data}), + ) data = cudf.Series([0, 0, 1, 1, 1])._column bool_col = cudf.Series([True, False, True, False, True])._column data[bool_col] = replace_data - assert_eq(cudf.Series(data), cudf.Series([100, 0, 300, 1, 500])) + assert_eq( + cudf.Series._from_data({None: data}), + cudf.Series([100, 0, 300, 1, 500]), + ) def test_column_set_unequal_length_object_by_mask(): diff --git a/python/cudf/cudf/tests/test_string_udfs.py b/python/cudf/cudf/tests/test_string_udfs.py index 4432d2afc8e..94b5820032a 100644 --- 
a/python/cudf/cudf/tests/test_string_udfs.py +++ b/python/cudf/cudf/tests/test_string_udfs.py @@ -96,7 +96,7 @@ def run_udf_test(data, func, dtype): else: result = output - got = cudf.Series(result, dtype=dtype) + got = cudf.Series._from_data({None: result.astype(dtype)}) assert_eq(expect, got, check_dtype=False) with _CUDFNumbaConfig(): udf_str_kernel.forall(len(data))(str_views, output) @@ -105,7 +105,7 @@ def run_udf_test(data, func, dtype): else: result = output - got = cudf.Series(result, dtype=dtype) + got = cudf.Series._from_data({None: result.astype(dtype)}) assert_eq(expect, got, check_dtype=False) From cbb8e54f34f3ed023d3110047c7b0032af502006 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 30 Jul 2024 19:34:19 -0700 Subject: [PATCH 3/8] Create _from_column --- python/cudf/cudf/core/byte_pair_encoding.py | 2 +- python/cudf/cudf/core/column/categorical.py | 4 +- python/cudf/cudf/core/column/methods.py | 9 ++-- python/cudf/cudf/core/column/string.py | 14 +++--- python/cudf/cudf/core/dataframe.py | 51 ++++++++----------- python/cudf/cudf/core/groupby/groupby.py | 12 ++--- python/cudf/cudf/core/index.py | 18 +++++-- python/cudf/cudf/core/indexed_frame.py | 8 +-- python/cudf/cudf/core/multiindex.py | 18 ++----- python/cudf/cudf/core/series.py | 52 +++++++++++--------- python/cudf/cudf/core/single_column_frame.py | 12 +++++ python/cudf/cudf/core/tokenize_vocabulary.py | 2 +- python/cudf/cudf/core/tools/datetimes.py | 8 ++- python/cudf/cudf/core/tools/numeric.py | 6 +-- python/cudf/cudf/datasets.py | 5 +- python/cudf/cudf/io/dlpack.py | 2 +- python/cudf/cudf/tests/test_apply_rows.py | 8 +-- python/cudf/cudf/tests/test_column.py | 6 +-- python/cudf/cudf/tests/test_dataframe.py | 34 ++++++------- python/cudf/cudf/tests/test_list.py | 2 +- python/cudf/cudf/tests/test_pickling.py | 8 +-- python/cudf/cudf/tests/test_series.py | 10 ++-- python/cudf/cudf/tests/test_setitem.py | 6 +-- 
python/cudf/cudf/tests/test_string.py | 2 +- python/cudf/cudf/tests/test_string_udfs.py | 4 +- 25 files changed, 146 insertions(+), 157 deletions(-) diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py index da9366e5b39..6ca64a0a2be 100644 --- a/python/cudf/cudf/core/byte_pair_encoding.py +++ b/python/cudf/cudf/core/byte_pair_encoding.py @@ -56,4 +56,4 @@ def __call__(self, text, separator: str = " ") -> cudf.Series: sep = cudf.Scalar(separator, dtype="str") result = cpp_byte_pair_encoding(text._column, self.merge_pairs, sep) - return cudf.Series._from_data({None: result}) + return cudf.Series._from_column(result) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index f8088770be2..453495231a1 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -132,7 +132,7 @@ def codes(self) -> cudf.Series: if isinstance(self._parent, cudf.Series) else None ) - return cudf.Series._from_data({None: self._column.codes}, index=index) + return cudf.Series._from_column(self._column.codes, index=index) @property def ordered(self) -> bool: @@ -939,7 +939,7 @@ def find_and_replace( # If a category is being replaced by an existing one, we # want to map it to None. 
If it's totally new, we want to # map it to the new label it is to be replaced by - dtype_replace = cudf.Series._from_data({None: replacement_col}) + dtype_replace = cudf.Series._from_column(replacement_col) dtype_replace[dtype_replace.isin(cats_col)] = None new_cats_col = cats_col.find_and_replace( to_replace_col, dtype_replace._column diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index 5abaef207a4..8c46d238057 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -86,11 +86,10 @@ def _return_or_inplace( data=table, index=self._parent.index ) elif isinstance(self._parent, cudf.Series): - ca = cudf.core.column_accessor.ColumnAccessor( - {self._parent.name: new_col}, verify=False - ) - return cudf.Series._from_data( - ca, self._parent.index if retain_index else None + return cudf.Series._from_column( + new_col, + name=self._parent.name, + index=self._parent.index if retain_index else None, ) elif isinstance(self._parent, cudf.BaseIndex): return cudf.Index(new_col, name=self._parent.name) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 9f1313e1569..14431230a3f 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -3679,12 +3679,9 @@ def find_multiple(self, patterns: SeriesOrIndex) -> cudf.Series: f"got: {patterns_column.dtype}" ) - return cudf.Series._from_data( - { - self._parent.name: libstrings.find_multiple( - self._column, patterns_column - ) - }, + return cudf.Series._from_column( + libstrings.find_multiple(self._column, patterns_column), + name=self._parent.name, index=self._parent.index if isinstance(self._parent, cudf.Series) else self._parent, @@ -4692,8 +4689,9 @@ def character_tokenize(self) -> SeriesOrIndex: if isinstance(self._parent, cudf.Series): lengths = self.len().fillna(0) index = self._parent.index.repeat(lengths) - data = {self._parent.name: result_col} - 
return cudf.Series._from_data(data, index=index) + return cudf.Series._from_column( + result_col, name=self._parent.name, index=index + ) elif isinstance(self._parent, cudf.BaseIndex): return cudf.Index(result_col, name=self._parent.name) else: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 211e7520629..f29aa69b59f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1489,14 +1489,14 @@ def __delitem__(self, name): self._drop_column(name) @_performance_tracking - def memory_usage(self, index=True, deep=False): + def memory_usage(self, index=True, deep=False) -> cudf.Series: mem_usage = [col.memory_usage for col in self._data.columns] names = [str(name) for name in self._data.names] if index: mem_usage.append(self.index.memory_usage()) names.append("Index") - return Series._from_data( - data={None: as_column(mem_usage)}, + return Series._from_column( + as_column(mem_usage), index=cudf.Index(names), ) @@ -3804,8 +3804,9 @@ def agg(self, aggs, axis=None): col_empty = column_empty( len(idxs), dtype=col.dtype, masked=True ) - ca = ColumnAccessor({None: col_empty}, verify=False) - ans = cudf.Series._from_data(ca, index=cudf.Index(idxs)) + ans = cudf.Series._from_column( + col_empty, index=cudf.Index(idxs) + ) if isinstance(aggs.get(key), abc.Iterable): # TODO : Allow simultaneous pass for multi-aggregation # as a future optimization @@ -4803,7 +4804,7 @@ def _func(x): # pragma: no cover # this could be written as a single kernel result = {} for name, col in self._data.items(): - apply_sr = Series._from_data({None: col}) + apply_sr = Series._from_column(col) result[name] = apply_sr.apply(_func)._column return DataFrame._from_data(result, index=self.index) @@ -6074,8 +6075,9 @@ def quantile( if q_is_number: result = result.transpose() - ca = ColumnAccessor({q: result._columns[0]}, verify=False) - return Series._from_data(ca, index=result.index) + return Series._from_column( + 
result._columns[0], name=q, index=result.index + ) else: # Ensure that qs is non-scalar so that we always get a column back. interpolation = interpolation or "linear" @@ -6336,12 +6338,8 @@ def count(self, axis=0, numeric_only=False): if axis != 0: raise NotImplementedError("Only axis=0 is currently supported.") length = len(self) - return Series._from_data( - { - None: as_column( - [length - col.null_count for col in self._columns] - ) - }, + return Series._from_column( + as_column([length - col.null_count for col in self._columns]), cudf.Index(self._data.names), ) @@ -6470,7 +6468,7 @@ def _reduce( ) else: idx = cudf.Index(source._data.names) - return Series._from_data({None: as_column(result)}, idx) + return Series._from_column(as_column(result), index=idx) elif axis == 1: return source._apply_cupy_method_axis_1(op, **kwargs) else: @@ -6700,8 +6698,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): result = result.set_mask( cudf._lib.transform.bools_to_mask(mask._column) ) - ca = ColumnAccessor({None: result}, verify=False) - return Series._from_data(ca, index=self.index) + return Series._from_column(result, index=self.index) else: result_df = DataFrame(result).set_index(self.index) result_df._set_columns_like(prepared._data) @@ -7289,9 +7286,7 @@ def unnamed_group_generator(): # Construct the resulting dataframe / series if not has_unnamed_levels: - result = Series._from_data( - data={None: stacked[0]}, index=new_index - ) + result = Series._from_column(stacked[0], index=new_index) else: if unnamed_level_values.nlevels == 1: unnamed_level_values = unnamed_level_values.get_level_values(0) @@ -7432,10 +7427,8 @@ def to_struct(self, name=None): size=len(self), offset=0, ) - return cudf.Series._from_data( - cudf.core.column_accessor.ColumnAccessor( - {name: col}, verify=False - ), + return cudf.Series._from_column( + col, index=self.index, name=name, ) @@ -7922,12 +7915,10 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): raise 
ValueError( "Cannot operate inplace if there is no assignment" ) - return Series._from_data( - { - None: libcudf.transform.compute_column( - [*self._columns], self._column_names, statements[0] - ) - } + return Series._from_column( + libcudf.transform.compute_column( + [*self._columns], self._column_names, statements[0] + ) ) targets = [] diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index f1b31b8335c..644af6ccec6 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -458,13 +458,11 @@ def size(self): """ Return the size of each group. """ - data = { - None: cudf.core.column.column_empty( - len(self.obj), "int8", masked=False - ) - } + col = cudf.core.column.column_empty( + len(self.obj), "int8", masked=False + ) return ( - cudf.Series._from_data(data) + cudf.Series._from_column(col) .groupby(self.grouping, sort=self._sort, dropna=self._dropna) .agg("size") ) @@ -1070,7 +1068,7 @@ def ngroup(self, ascending=True): # Count descending from num_groups - 1 to 0 groups = range(num_groups - 1, -1, -1) - group_ids = cudf.Series._from_data({None: as_column(groups)}) + group_ids = cudf.Series._from_column(as_column(groups)) if has_null_group: group_ids.iloc[-1] = cudf.NA diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 404c100ab76..e8c63dda65d 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -60,7 +60,7 @@ from cudf.utils.utils import _warn_no_dask_cudf, search_range if TYPE_CHECKING: - from collections.abc import Generator, Iterable + from collections.abc import Generator, Hashable, Iterable def ensure_index(index_like: Any) -> BaseIndex: @@ -1070,6 +1070,16 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): return NotImplemented + @classmethod + @_performance_tracking + def _from_column( + cls, column: ColumnBase, *, name: Hashable = None + ) -> Self: + ca = 
cudf.core.column_accessor.ColumnAccessor( + {name: column}, verify=False + ) + return _index_from_data(ca) + @classmethod @_performance_tracking def _from_data(cls, data: MutableMapping, name: Any = no_default) -> Self: @@ -1319,21 +1329,21 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): scatter_map, indices = libcudf.join.join([lcol], [rcol], how="inner") result = libcudf.copying.scatter([indices], scatter_map, [result])[0] - result_series = cudf.Series._from_data({None: result}) + result_series = cudf.Series._from_column(result) if method in {"ffill", "bfill", "pad", "backfill"}: result_series = _get_indexer_basic( index=self, positions=result_series, method=method, - target_col=cudf.Series._from_data({None: needle}), + target_col=cudf.Series._from_column(needle), tolerance=tolerance, ) elif method == "nearest": result_series = _get_nearest_indexer( index=self, positions=result_series, - target_col=cudf.Series._from_data({None: needle}), + target_col=cudf.Series._from_column(needle), tolerance=tolerance, ) elif method is not None: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 4cd847d38b8..24d947a574a 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -2938,8 +2938,8 @@ def hash_values(self, method="murmur3", seed=None): # Note that both Series and DataFrame return Series objects from this # calculation, necessitating the unfortunate circular reference to the # child class here. 
- return cudf.Series._from_data( - {None: libcudf.hash.hash([*self._columns], method, seed)}, + return cudf.Series._from_column( + libcudf.hash.hash([*self._columns], method, seed), index=self.index, ) @@ -3229,7 +3229,7 @@ def duplicated(self, subset=None, keep="first"): [as_column(True, length=len(self), dtype=bool)], bounds_check=False, )[0] - return cudf.Series._from_data({None: result}, index=self.index) + return cudf.Series._from_column(result, index=self.index) @_performance_tracking def _empty_like(self, keep_index=True) -> Self: @@ -3510,7 +3510,7 @@ def _apply(self, func, kernel_getter, *args, **kwargs): col = _post_process_output_col(ans_col, retty) col.set_base_mask(libcudf.transform.bools_to_mask(ans_mask)) - result = cudf.Series._from_data({None: col}, self.index) + result = cudf.Series._from_column(col, index=self.index) return result diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index c5b85a09f2e..33330fe92cb 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -702,12 +702,8 @@ def _compute_validity_mask(self, index, row_tuple, max_length): data_table = cudf.concat( [ frame, - cudf.DataFrame( - { - "idx": cudf.Series( - column.as_column(range(len(frame))) - ) - } + cudf.DataFrame._from_data( + {"idx": column.as_column(range(len(frame)))} ), ], axis=1, @@ -786,11 +782,7 @@ def _index_and_downcast(self, result, index, index_key): out_index.insert( out_index._num_columns, k, - cudf.Series._from_data( - ColumnAccessor( - {None: index._data.columns[k]}, verify=False - ) - ), + cudf.Series._from_column(index._data.columns[k]), ) # determine if we should downcast from a DataFrame to a Series @@ -1930,9 +1922,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): how="inner", ) result = libcudf.copying.scatter([indices], scatter_map, [result])[0] - result_series = cudf.Series._from_data( - ColumnAccessor({None: result}, verify=False) - ) + result_series 
= cudf.Series._from_column(result) if method in {"ffill", "bfill", "pad", "backfill"}: result_series = _get_indexer_basic( diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 333cb667603..92fb537a577 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -396,10 +396,9 @@ def _loc_to_iloc(self, arg): return _indices_from_labels(self._frame, arg) else: - ca = ColumnAccessor( - {None: cudf.core.column.as_column(arg)}, verify=False + arg = cudf.core.series.Series._from_column( + cudf.core.column.as_column(arg) ) - arg = cudf.core.series.Series._from_data(ca) if arg.dtype.kind == "b": return arg else: @@ -515,8 +514,7 @@ def from_categorical(cls, categorical, codes=None): col = cudf.core.column.categorical.pandas_categorical_as_column( categorical, codes=codes ) - ca = ColumnAccessor({None: col}, verify=False) - return Series._from_data(ca) + return Series._from_column(col) @classmethod @_performance_tracking @@ -694,6 +692,18 @@ def __init__( self._index = second_index self._check_data_index_length_match() + @classmethod + @_performance_tracking + def _from_column( + cls, + column: ColumnBase, + *, + name: abc.Hashable = None, + index: BaseIndex | None = None, + ) -> Self: + ca = ColumnAccessor({name: column}, verify=False) + return cls._from_data(ca, index=index) + @classmethod @_performance_tracking def _from_data( @@ -2753,8 +2763,8 @@ def mode(self, dropna=True): if len(val_counts) > 0: val_counts = val_counts[val_counts == val_counts.iloc[0]] - return Series._from_data( - {self.name: val_counts.index.sort_values()._column}, name=self.name + return Series._from_column( + val_counts.index.sort_values()._column, name=self.name ) @_performance_tracking @@ -3043,8 +3053,8 @@ def isin(self, values): f"to isin(), you passed a [{type(values).__name__}]" ) - return Series._from_data( - {self.name: self._column.isin(values)}, index=self.index + return Series._from_column( + self._column.isin(values), 
name=self.name, index=self.index ) @_performance_tracking @@ -3080,9 +3090,7 @@ def unique(self): res = self._column.unique() if cudf.get_option("mode.pandas_compatible"): return res.values - return Series._from_data( - self._data._from_columns_like_self([res], verify=False) - ) + return Series._from_column(res, name=self.name) @_performance_tracking def value_counts( @@ -3314,8 +3322,9 @@ def quantile( if return_scalar: return result - return Series._from_data( - data={self.name: result}, + return Series._from_column( + result, + name=self.name, index=cudf.Index(np_array_q) if quant_index else None, ) @@ -3397,15 +3406,10 @@ def digitize(self, bins, right=False): 3 2 dtype: int32 """ - ca = ColumnAccessor( - { - self.name: cudf.core.column.numerical.digitize( - self._column, bins, right - ) - }, - verify=False, + return Series._from_column( + cudf.core.column.numerical.digitize(self._column, bins, right), + name=self.name, ) - return Series._from_data(ca) @_performance_tracking def diff(self, periods=1): @@ -5366,10 +5370,10 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): elif b_col.null_count: null_values = b_col.isnull() else: - return Series._from_data({None: result_col}, index=index) + return Series._from_column(result_col, index=index) result_col[null_values] = False if equal_nan is True and a_col.null_count and b_col.null_count: result_col[equal_nulls] = True - return Series._from_data({None: result_col}, index=index) + return Series._from_column(result_col, index=index) diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 84a459ae585..eb6714029cf 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -15,11 +15,14 @@ is_numeric_dtype, ) from cudf.core.column import ColumnBase, as_column +from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame from cudf.utils.performance_tracking import 
_performance_tracking from cudf.utils.utils import NotIterable if TYPE_CHECKING: + from collections.abc import Hashable + import cupy import numpy import pyarrow as pa @@ -110,6 +113,15 @@ def values(self) -> cupy.ndarray: # noqa: D102 def values_host(self) -> numpy.ndarray: # noqa: D102 return self._column.values_host + @classmethod + @_performance_tracking + def _from_column( + cls, column: ColumnBase, *, name: Hashable = None + ) -> Self: + """Constructor for a single Column.""" + ca = ColumnAccessor({name: column}, verify=False) + return cls._from_data(ca) + @classmethod @_performance_tracking def from_arrow(cls, array) -> Self: diff --git a/python/cudf/cudf/core/tokenize_vocabulary.py b/python/cudf/cudf/core/tokenize_vocabulary.py index 5a45fc80d1d..99d85c0c5c0 100644 --- a/python/cudf/cudf/core/tokenize_vocabulary.py +++ b/python/cudf/cudf/core/tokenize_vocabulary.py @@ -47,4 +47,4 @@ def tokenize( text._column, self.vocabulary, delim, default_id ) - return cudf.Series._from_data({None: result}) + return cudf.Series._from_column(result) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 2247837bbcb..2f77778116f 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -277,8 +277,7 @@ def to_datetime( format=format, utc=utc, ) - ca = ColumnAccessor({None: col}, verify=False) - return cudf.Series._from_data(ca, index=arg.index) + return cudf.Series._from_column(col, index=arg.index) else: col = _process_col( col=column.as_column(arg), @@ -292,9 +291,8 @@ def to_datetime( ca = ColumnAccessor({arg.name: col}, verify=False) return cudf.DatetimeIndex._from_data(ca) elif isinstance(arg, (cudf.Series, pd.Series)): - ca = ColumnAccessor({arg.name: col}, verify=False) - return cudf.Series._from_data( - ca, index=ensure_index(arg.index) + return cudf.Series._from_column( + col, name=arg.name, index=ensure_index(arg.index) ) elif is_scalar(arg): return 
col.element_indexing(0) diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index ef6955be643..8b95f6f6a04 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -12,7 +12,6 @@ from cudf._lib import strings as libstrings from cudf.api.types import _is_non_decimal_numeric_dtype, is_string_dtype from cudf.core.column import as_column -from cudf.core.column_accessor import ColumnAccessor from cudf.core.dtypes import CategoricalDtype from cudf.core.index import ensure_index from cudf.utils.dtypes import can_convert_to_column @@ -171,8 +170,9 @@ def to_numeric(arg, errors="raise", downcast=None): break if isinstance(arg, (cudf.Series, pd.Series)): - ca = ColumnAccessor({arg.name: col}, verify=False) - return cudf.Series._from_data(ca, index=ensure_index(arg.index)) + return cudf.Series._from_column( + col, name=arg.name, index=ensure_index(arg.index) + ) else: if col.has_nulls(): # To match pandas, always return a floating type filled with nan. 
diff --git a/python/cudf/cudf/datasets.py b/python/cudf/cudf/datasets.py index 7b183d5f1a3..dbabaacf6b5 100644 --- a/python/cudf/cudf/datasets.py +++ b/python/cudf/cudf/datasets.py @@ -5,7 +5,6 @@ import cudf from cudf._lib.transform import bools_to_mask -from cudf.core.column_accessor import ColumnAccessor __all__ = ["timeseries", "randomdata"] @@ -73,9 +72,7 @@ def timeseries( ) mask_buf = bools_to_mask(cudf.core.column.as_column(mask)) masked_col = gdf[col]._column.set_mask(mask_buf) - gdf[col] = cudf.Series._from_data( - ColumnAccessor({None: masked_col}), index=gdf.index - ) + gdf[col] = cudf.Series._from_column(masked_col, index=gdf.index) return gdf diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index d3d99aab0cd..1347b2cc38f 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -71,7 +71,7 @@ def to_dlpack(cudf_obj): if isinstance(cudf_obj, (cudf.DataFrame, cudf.Series, cudf.BaseIndex)): gdf = cudf_obj elif isinstance(cudf_obj, ColumnBase): - gdf = cudf.Series._from_data({None: cudf_obj}) + gdf = cudf.Series._from_column(cudf_obj) else: raise TypeError( f"Input of type {type(cudf_obj)} cannot be converted " diff --git a/python/cudf/cudf/tests/test_apply_rows.py b/python/cudf/cudf/tests/test_apply_rows.py index 8e7c6975201..f9b0d9c1e78 100644 --- a/python/cudf/cudf/tests/test_apply_rows.py +++ b/python/cudf/cudf/tests/test_apply_rows.py @@ -27,11 +27,11 @@ def test_dataframe_apply_rows(dtype, has_nulls, pessimistic): gdf_series_expected = gdf_series_a * gdf_series_b else: # optimistically ignore the null masks - a = cudf.Series._from_data( - {None: column.build_column(gdf_series_a.data, dtype)} + a = cudf.Series._from_column( + column.build_column(gdf_series_a.data, dtype) ) - b = cudf.Series_from_data( - {None: column.build_column(gdf_series_b.data, dtype)} + b = cudf.Series._from_column( + column.build_column(gdf_series_b.data, dtype) ) gdf_series_expected = a * b diff --git 
a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 3a91c116dec..ca6cea3e778 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -95,7 +95,7 @@ def test_column_offset_and_size(pandas_input, offset, size): else: assert col.size == (col.data.size / col.dtype.itemsize) - got = cudf.Series._from_data({None: col}) + got = cudf.Series._from_column(col) if offset is None: offset = 0 @@ -112,8 +112,8 @@ def test_column_offset_and_size(pandas_input, offset, size): def column_slicing_test(col, offset, size, cast_to_float=False): col_slice = col.slice(offset, offset + size) - series = cudf.Series._from_data({None: col}) - sliced_series = cudf.Series._from_data({None: col_slice}) + series = cudf.Series._from_column(col) + sliced_series = cudf.Series._from_column(col_slice) if cast_to_float: pd_series = series.astype(float).to_pandas() diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index f73b815a2b5..2c59253d500 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -4264,35 +4264,35 @@ def test_empty_dataframe_describe(): def test_as_column_types(): col = column.as_column(cudf.Series([], dtype="float64")) assert_eq(col.dtype, np.dtype("float64")) - gds = cudf.Series._from_data({None: col}) + gds = cudf.Series._from_column(col) pds = pd.Series(pd.Series([], dtype="float64")) assert_eq(pds, gds) col = column.as_column(cudf.Series([], dtype="float64"), dtype="float32") assert_eq(col.dtype, np.dtype("float32")) - gds = cudf.Series._from_data({None: col}) + gds = cudf.Series._from_column(col) pds = pd.Series(pd.Series([], dtype="float32")) assert_eq(pds, gds) col = column.as_column(cudf.Series([], dtype="float64"), dtype="str") assert_eq(col.dtype, np.dtype("object")) - gds = cudf.Series._from_data({None: col}) + gds = cudf.Series._from_column(col) pds = pd.Series(pd.Series([], dtype="str")) assert_eq(pds, 
gds) col = column.as_column(cudf.Series([], dtype="float64"), dtype="object") assert_eq(col.dtype, np.dtype("object")) - gds = cudf.Series._from_data({None: col}) + gds = cudf.Series._from_column(col) pds = pd.Series(pd.Series([], dtype="object")) assert_eq(pds, gds) pds = pd.Series(np.array([1, 2, 3]), dtype="float32") - gds = cudf.Series._from_data( - {None: column.as_column(np.array([1, 2, 3]), dtype="float32")} + gds = cudf.Series._from_column( + column.as_column(np.array([1, 2, 3]), dtype="float32") ) assert_eq(pds, gds) @@ -4303,30 +4303,26 @@ def test_as_column_types(): assert_eq(pds, gds) pds = pd.Series([], dtype="float64") - gds = cudf.Series._from_data({None: column.as_column(pds)}) + gds = cudf.Series._from_column(column.as_column(pds)) assert_eq(pds, gds) pds = pd.Series([1, 2, 4], dtype="int64") - gds = cudf.Series._from_data( - {None: column.as_column(cudf.Series([1, 2, 4]), dtype="int64")} + gds = cudf.Series._from_column( + column.as_column(cudf.Series([1, 2, 4]), dtype="int64") ) assert_eq(pds, gds) pds = pd.Series([1.2, 18.0, 9.0], dtype="float32") - gds = cudf.Series._from_data( - { - None: column.as_column( - cudf.Series([1.2, 18.0, 9.0]), dtype="float32" - ) - } + gds = cudf.Series._from_column( + column.as_column(cudf.Series([1.2, 18.0, 9.0]), dtype="float32") ) assert_eq(pds, gds) pds = pd.Series([1.2, 18.0, 9.0], dtype="str") - gds = cudf.Series._from_data( - {None: column.as_column(cudf.Series([1.2, 18.0, 9.0]), dtype="str")} + gds = cudf.Series._from_column( + column.as_column(cudf.Series([1.2, 18.0, 9.0]), dtype="str") ) assert_eq(pds, gds) @@ -6529,8 +6525,8 @@ def test_from_pandas_for_series_nan_as_null(nan_as_null): data = [np.nan, 2.0, 3.0] psr = pd.Series(data) - expected = cudf.Series._from_data( - {None: column.as_column(data, nan_as_null=nan_as_null)} + expected = cudf.Series._from_column( + column.as_column(data, nan_as_null=nan_as_null) ) got = cudf.from_pandas(psr, nan_as_null=nan_as_null) diff --git 
a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 36bcaa66d7d..c4c883ca9f9 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -946,5 +946,5 @@ def test_empty_nested_list_uninitialized_offsets_memory_usage(): null_count=col.null_count, children=(column_empty(0, col.children[0].dtype), empty_inner), ) - ser = cudf.Series._from_data({None: col_empty_offset}) + ser = cudf.Series._from_column(col_empty_offset) assert ser.memory_usage() == 8 diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py index 68e2cfe0fb7..0f13a9e173a 100644 --- a/python/cudf/cudf/tests/test_pickling.py +++ b/python/cudf/cudf/tests/test_pickling.py @@ -127,9 +127,7 @@ def test_pickle_categorical_column(slices): pickled = pickle.dumps(input_col) out = pickle.loads(pickled) - assert_eq( - Series._from_data({None: out}), Series._from_data({None: input_col}) - ) + assert_eq(Series._from_column(out), Series._from_column(input_col)) @pytest.mark.parametrize( @@ -150,6 +148,4 @@ def test_pickle_string_column(slices): pickled = pickle.dumps(input_col) out = pickle.loads(pickled) - assert_eq( - Series._from_data({None: out}), Series._from_data({None: input_col}) - ) + assert_eq(Series._from_column(out), Series._from_column(input_col)) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 5075977db0f..6a1887afb1f 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2041,7 +2041,7 @@ def test_series_ordered_dedup(): sr = cudf.Series(np.random.randint(0, 100, 1000)) # pandas unique() preserves order expect = pd.Series(sr.to_pandas().unique()) - got = cudf.Series._from_data({None: sr._column.unique()}) + got = cudf.Series._from_column(sr._column.unique()) assert_eq(expect.values, got.values) @@ -2697,8 +2697,8 @@ def test_series_duplicate_index_reindex(): def test_list_category_like_maintains_dtype(): dtype = 
cudf.CategoricalDtype(categories=[1, 2, 3, 4], ordered=True) data = [1, 2, 3] - result = cudf.Series._from_data( - {None: cudf.core.column.as_column(data, dtype=dtype)} + result = cudf.Series._from_column( + cudf.core.column.as_column(data, dtype=dtype) ) expected = pd.Series(data, dtype=dtype.to_pandas()) assert_eq(result, expected) @@ -2707,8 +2707,8 @@ def test_list_category_like_maintains_dtype(): def test_list_interval_like_maintains_dtype(): dtype = cudf.IntervalDtype(subtype=np.int8) data = [pd.Interval(1, 2)] - result = cudf.Series._from_data( - {None: cudf.core.column.as_column(data, dtype=dtype)} + result = cudf.Series._from_column( + cudf.core.column.as_column(data, dtype=dtype) ) expected = pd.Series(data, dtype=dtype.to_pandas()) assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index 4c6da6cf1f8..5406836ba61 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -179,8 +179,8 @@ def test_column_set_equal_length_object_by_mask(): data[bool_col] = replace_data assert_eq( - cudf.Series._from_data({None: data}), - cudf.Series._from_data({None: replace_data}), + cudf.Series._from_column(data), + cudf.Series._from_column(replace_data), ) data = cudf.Series([0, 0, 1, 1, 1])._column @@ -188,7 +188,7 @@ def test_column_set_equal_length_object_by_mask(): data[bool_col] = replace_data assert_eq( - cudf.Series._from_data({None: data}), + cudf.Series._from_column(data), cudf.Series([100, 0, 300, 1, 500]), ) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 1acc0bc2041..4bd084a3938 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -2677,7 +2677,7 @@ def test_string_int_to_ipv4(): ["0.0.0.0", None, "0.0.0.0", "41.168.0.1", "127.0.0.1", "41.197.0.1"] ) - got = cudf.Series._from_data({None: gsr._column.int2ip()}) + got = 
cudf.Series._from_column(gsr._column.int2ip()) assert_eq(expected, got) diff --git a/python/cudf/cudf/tests/test_string_udfs.py b/python/cudf/cudf/tests/test_string_udfs.py index 94b5820032a..69876d97aad 100644 --- a/python/cudf/cudf/tests/test_string_udfs.py +++ b/python/cudf/cudf/tests/test_string_udfs.py @@ -96,7 +96,7 @@ def run_udf_test(data, func, dtype): else: result = output - got = cudf.Series._from_data({None: result.astype(dtype)}) + got = cudf.Series._from_column(result.astype(dtype)) assert_eq(expect, got, check_dtype=False) with _CUDFNumbaConfig(): udf_str_kernel.forall(len(data))(str_views, output) @@ -105,7 +105,7 @@ def run_udf_test(data, func, dtype): else: result = output - got = cudf.Series._from_data({None: result.astype(dtype)}) + got = cudf.Series._from_column(result.astype(dtype)) assert_eq(expect, got, check_dtype=False) From f91285d2380ef4f305b7a8965c1b39f86bed061e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 31 Jul 2024 14:45:24 -0700 Subject: [PATCH 4/8] Fix more usages --- python/cudf/cudf/core/dataframe.py | 13 +++++--- python/cudf/cudf/core/multiindex.py | 5 ++- python/cudf/cudf/tests/test_column.py | 38 +++++++++++++++------- python/cudf/cudf/tests/test_decimal.py | 10 +++--- python/cudf/cudf/tests/test_df_protocol.py | 6 ++-- 5 files changed, 47 insertions(+), 25 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f29aa69b59f..c9b8b59a9ad 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -382,7 +382,10 @@ def _setitem_tuple_arg(self, key, value): length = len(idx) if idx is not None else 1 value = as_column(value, length=length) - new_col = cudf.Series(value, index=idx) + if isinstance(value, ColumnBase): + new_col = cudf.Series._from_column(value, index=idx) + else: + new_col = cudf.Series(value, index=idx) if len(self._frame.index) != 0: new_col = new_col._align_to_index( 
self._frame.index, how="right" @@ -506,12 +509,12 @@ def __getitem__(self, arg): # turn any heterogeneous set of columns into a series if # you only ask for one row. new_name = result.index[0] - result = Series._concat( + ser = Series._concat( [result[name] for name in column_names], ) - result.index = cudf.Index(result.keys()) - result.name = new_name - return result + ser.index = ensure_index(result.keys()) + ser.name = new_name + return ser except TypeError: # Couldn't find a common type, Hence: # Raise in pandas compatibility mode, diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 33330fe92cb..9646b34830f 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -848,7 +848,10 @@ def _get_row_major( valid_indices = self._get_valid_indices_by_tuple( df.index, row_tuple, len(df.index) ) - indices = cudf.Series(valid_indices) + if isinstance(valid_indices, column.ColumnBase): + indices = cudf.Series._from_column(valid_indices) + else: + indices = cudf.Series(valid_indices) result = df.take(indices) final = self._index_and_downcast(result, result.index, row_tuple) return final diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index ca6cea3e778..4aa7fb27c9b 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -208,7 +208,9 @@ def test_as_column_scalar_with_nan(nan_as_null, scalar, size): ) got = ( - cudf.Series(as_column(scalar, length=size, nan_as_null=nan_as_null)) + cudf.Series._from_column( + as_column(scalar, length=size, nan_as_null=nan_as_null) + ) .dropna() .to_numpy() ) @@ -250,12 +252,18 @@ def test_column_chunked_array_creation(): actual_column = cudf.core.column.as_column(chunked_array, dtype="float") expected_column = cudf.core.column.as_column(pyarrow_array, dtype="float") - assert_eq(cudf.Series(actual_column), cudf.Series(expected_column)) + assert_eq( + 
cudf.Series._from_column(actual_column), + cudf.Series._from_column(expected_column), + ) actual_column = cudf.core.column.as_column(chunked_array) expected_column = cudf.core.column.as_column(pyarrow_array) - assert_eq(cudf.Series(actual_column), cudf.Series(expected_column)) + assert_eq( + cudf.Series._from_column(actual_column), + cudf.Series._from_column(expected_column), + ) @pytest.mark.parametrize( @@ -287,7 +295,7 @@ def test_column_view_valid_numeric_to_numeric(data, from_dtype, to_dtype): gpu_data_view = gpu_data.view(to_dtype) expect = pd.Series(cpu_data_view, dtype=cpu_data_view.dtype) - got = cudf.Series(gpu_data_view, dtype=gpu_data_view.dtype) + got = cudf.Series._from_column(gpu_data_view).astype(gpu_data_view.dtype) gpu_ptr = gpu_data.data.get_ptr(mode="read") assert gpu_ptr == got._column.data.get_ptr(mode="read") @@ -327,7 +335,7 @@ def test_column_view_invalid_numeric_to_numeric(data, from_dtype, to_dtype): ], ) def test_column_view_valid_string_to_numeric(data, to_dtype): - expect = cudf.Series(cudf.Series(data)._column.view(to_dtype)) + expect = cudf.Series._from_column(cudf.Series(data)._column.view(to_dtype)) got = cudf.Series(str_host_view(data, to_dtype)) assert_eq(expect, got) @@ -342,7 +350,7 @@ def test_column_view_nulls_widths_even(): sr = cudf.Series(data, dtype="int32") expect = cudf.Series(expect_data, dtype="float32") - got = cudf.Series(sr._column.view("float32")) + got = cudf.Series._from_column(sr._column.view("float32")) assert_eq(expect, got) @@ -354,7 +362,7 @@ def test_column_view_nulls_widths_even(): sr = cudf.Series(data, dtype="float64") expect = cudf.Series(expect_data, dtype="int64") - got = cudf.Series(sr._column.view("int64")) + got = cudf.Series._from_column(sr._column.view("int64")) assert_eq(expect, got) @@ -365,7 +373,9 @@ def test_column_view_numeric_slice(slc): sr = cudf.Series(data) expect = cudf.Series(data[slc].view("int64")) - got = cudf.Series(sr._column.slice(slc.start, slc.stop).view("int64")) + got = 
cudf.Series._from_column( + sr._column.slice(slc.start, slc.stop).view("int64") + ) assert_eq(expect, got) @@ -376,7 +386,7 @@ def test_column_view_numeric_slice(slc): def test_column_view_string_slice(slc): data = ["a", "bcde", "cd", "efg", "h"] - expect = cudf.Series( + expect = cudf.Series._from_column( cudf.Series(data)._column.slice(slc.start, slc.stop).view("int8") ) got = cudf.Series(str_host_view(data[slc], "int8")) @@ -409,7 +419,10 @@ def test_as_column_buffer(data, expected): actual_column = cudf.core.column.as_column( cudf.core.buffer.as_buffer(data), dtype=data.dtype ) - assert_eq(cudf.Series(actual_column), cudf.Series(expected)) + assert_eq( + cudf.Series._from_column(actual_column), + cudf.Series._from_column(expected), + ) @pytest.mark.parametrize( @@ -436,7 +449,10 @@ def test_as_column_arrow_array(data, pyarrow_kwargs, cudf_kwargs): pyarrow_data = pa.array(data, **pyarrow_kwargs) cudf_from_pyarrow = as_column(pyarrow_data) expected = as_column(data, **cudf_kwargs) - assert_eq(cudf.Series(cudf_from_pyarrow), cudf.Series(expected)) + assert_eq( + cudf.Series._from_column(cudf_from_pyarrow), + cudf.Series._from_column(expected), + ) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py index 65f739bc74a..b63788d20b7 100644 --- a/python/cudf/cudf/tests/test_decimal.py +++ b/python/cudf/cudf/tests/test_decimal.py @@ -106,7 +106,7 @@ def test_typecast_from_float_to_decimal(request, data, from_dtype, to_dtype): pa_arr = got.to_arrow().cast( pa.decimal128(to_dtype.precision, to_dtype.scale) ) - expected = cudf.Series(Decimal64Column.from_arrow(pa_arr)) + expected = cudf.Series._from_column(Decimal64Column.from_arrow(pa_arr)) got = got.astype(to_dtype) @@ -146,7 +146,7 @@ def test_typecast_from_int_to_decimal(data, from_dtype, to_dtype): .cast("float64") .cast(pa.decimal128(to_dtype.precision, to_dtype.scale)) ) - expected = cudf.Series(Decimal64Column.from_arrow(pa_arr)) + expected = 
cudf.Series._from_column(Decimal64Column.from_arrow(pa_arr)) got = got.astype(to_dtype) @@ -206,9 +206,9 @@ def test_typecast_to_from_decimal(data, from_dtype, to_dtype): pa.decimal128(to_dtype.precision, to_dtype.scale), safe=False ) if isinstance(to_dtype, Decimal32Dtype): - expected = cudf.Series(Decimal32Column.from_arrow(pa_arr)) + expected = cudf.Series._from_column(Decimal32Column.from_arrow(pa_arr)) elif isinstance(to_dtype, Decimal64Dtype): - expected = cudf.Series(Decimal64Column.from_arrow(pa_arr)) + expected = cudf.Series._from_column(Decimal64Column.from_arrow(pa_arr)) with expect_warning_if(to_dtype.scale < s.dtype.scale, UserWarning): got = s.astype(to_dtype) @@ -245,7 +245,7 @@ def test_typecast_from_decimal(data, from_dtype, to_dtype): pa_arr = got.to_arrow().cast(to_dtype, safe=False) got = got.astype(to_dtype) - expected = cudf.Series(NumericalColumn.from_arrow(pa_arr)) + expected = cudf.Series._from_column(NumericalColumn.from_arrow(pa_arr)) assert_eq(got, expected) assert_eq(got.dtype, expected.dtype) diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index 7f48e414180..44270d20d59 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -78,7 +78,7 @@ def assert_buffer_equal(buffer_and_dtype: tuple[_CuDFBuffer, Any], cudfcol): # FIXME: In gh-10202 some minimal fixes were added to unblock CI. But # currently only non-null values are compared, null positions are # unchecked. 
- non_null_idxs = ~cudf.Series(cudfcol).isna() + non_null_idxs = cudfcol.notnull() assert_eq( col_from_buf.apply_boolean_mask(non_null_idxs), cudfcol.apply_boolean_mask(non_null_idxs), @@ -86,8 +86,8 @@ def assert_buffer_equal(buffer_and_dtype: tuple[_CuDFBuffer, Any], cudfcol): array_from_dlpack = cp.from_dlpack(buf.__dlpack__()).get() col_array = cp.asarray(cudfcol.data_array_view(mode="read")).get() assert_eq( - array_from_dlpack[non_null_idxs.to_numpy()].flatten(), - col_array[non_null_idxs.to_numpy()].flatten(), + array_from_dlpack[non_null_idxs.values_host].flatten(), + col_array[non_null_idxs.values_host].flatten(), ) From 1f240c3bbf62765416734893e975de39812ad325 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 31 Jul 2024 19:46:06 -0700 Subject: [PATCH 5/8] Fix failures --- python/cudf/cudf/core/column/categorical.py | 8 +--- python/cudf/cudf/core/dataframe.py | 47 ++++++++++++--------- python/cudf/cudf/core/groupby/groupby.py | 2 +- python/cudf/cudf/core/index.py | 4 +- python/cudf/cudf/core/series.py | 3 +- 5 files changed, 32 insertions(+), 32 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 453495231a1..45bc5e6937f 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1265,12 +1265,8 @@ def _categories_equal( return False # if order doesn't matter, sort before the equals call below if not ordered: - cur_categories = cudf.Series(cur_categories).sort_values( - ignore_index=True - ) - new_categories = cudf.Series(new_categories).sort_values( - ignore_index=True - ) + cur_categories = cur_categories.sort_values() + new_categories = new_categories.sort_values() return cur_categories.equals(new_categories) def _set_categories( diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index c9b8b59a9ad..3ea30374df7 100644 --- 
a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -503,28 +503,33 @@ def __getitem__(self, arg): return frame._slice(row_spec.key) elif isinstance(row_spec, indexing_utils.ScalarIndexer): result = frame._gather(row_spec.key, keep_index=True) + new_name = result.index[0] + new_index = ensure_index(result.keys()) # Attempt to turn into series. - try: - # Behaviour difference from pandas, which will merrily - # turn any heterogeneous set of columns into a series if - # you only ask for one row. - new_name = result.index[0] - ser = Series._concat( - [result[name] for name in column_names], - ) - ser.index = ensure_index(result.keys()) - ser.name = new_name - return ser - except TypeError: - # Couldn't find a common type, Hence: - # Raise in pandas compatibility mode, - # or just return a 1xN dataframe otherwise - if cudf.get_option("mode.pandas_compatible"): - raise TypeError( - "All columns need to be of same type, please " - "typecast to common dtype." + if len(column_names) == 0: + return Series([], index=new_index, name=new_name) + else: + try: + # Behaviour difference from pandas, which will merrily + # turn any heterogeneous set of columns into a series if + # you only ask for one row. + ser = Series._concat( + [result[name] for name in column_names], ) - return result + except TypeError as err: + # Couldn't find a common type, Hence: + # Raise in pandas compatibility mode, + # or just return a 1xN dataframe otherwise + if cudf.get_option("mode.pandas_compatible"): + raise TypeError( + "All columns need to be of same type, please " + "typecast to common dtype." 
+ ) from err + return result + else: + ser.index = new_index + ser.name = new_name + return ser elif isinstance(row_spec, indexing_utils.EmptyIndexer): return frame._empty_like(keep_index=True) assert_never(row_spec) @@ -6343,7 +6348,7 @@ def count(self, axis=0, numeric_only=False): length = len(self) return Series._from_column( as_column([length - col.null_count for col in self._columns]), - cudf.Index(self._data.names), + index=cudf.Index(self._data.names), ) _SUPPORT_AXIS_LOOKUP = { diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 644af6ccec6..92c4b73ceaa 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -483,7 +483,7 @@ def cumcount(self, ascending: bool = True): "ascending is currently not implemented." ) return ( - cudf.Series( + cudf.Series._from_column( cudf.core.column.column_empty( len(self.obj), "int8", masked=False ), diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index b5094a6329d..59314903880 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1122,10 +1122,10 @@ def from_arrow(cls, obj): >>> import cudf >>> import pyarrow as pa >>> cudf.Index.from_arrow(pa.array(["a", "b", None])) - Index(['a', 'b', None], dtype='object') + Index(['a', 'b', ], dtype='object') """ try: - return cls(ColumnBase.from_arrow(obj)) + return cls._from_column(ColumnBase.from_arrow(obj)) except TypeError: # Try interpreting object as a MultiIndex before failing. 
return cudf.MultiIndex.from_arrow(obj) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index a6cf3c0c99b..de57ac5f290 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -544,8 +544,7 @@ def from_arrow(cls, array: pa.Array): 2 dtype: object """ - ca = ColumnAccessor({None: ColumnBase.from_arrow(array)}, verify=False) - return cls._from_data(ca) + return cls._from_column(ColumnBase.from_arrow(array)) @classmethod @_performance_tracking From 0749f5ad3f816d76a743ffa4805383baa331b28d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 1 Aug 2024 09:26:39 -0700 Subject: [PATCH 6/8] Address dask_cudf usage --- python/dask_cudf/dask_cudf/backends.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 4bdb5d921ec..a68bd3789ca 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -153,7 +153,7 @@ def _nonempty_series(s, idx=None): idx = _nonempty_index(s.index) data = _get_non_empty_data(s._column) - return cudf.Series(data, name=s.name, index=idx) + return cudf.Series._from_column(data, name=s.name, index=idx) @meta_nonempty.register(cudf.DataFrame) From 7ec4cc251d1545036860e4430fedff07be8ed82c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 1 Aug 2024 10:50:37 -0700 Subject: [PATCH 7/8] More places to use _from_column --- python/dask_cudf/dask_cudf/backends.py | 2 +- python/dask_cudf/dask_cudf/core.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index a68bd3789ca..adf4dd411e8 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -424,7 +424,7 @@ def hash_object_cudf_index(ind, index=None): return 
ind.to_frame(index=False).hash_values() col = cudf.core.column.as_column(ind) - return cudf.Series(col).hash_values() + return cudf.Series._from_column(col).hash_values() @group_split_dispatch.register((cudf.Series, cudf.DataFrame)) diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index aab56e3a1b0..3181c8d69ec 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -342,7 +342,7 @@ def groupby(self, by=None, **kwargs): def sum_of_squares(x): x = x.astype("f8")._column outcol = libcudf.reduce.reduce("sum_of_squares", x) - return cudf.Series(outcol) + return cudf.Series._from_column(outcol) @_dask_cudf_performance_tracking From 6348a5e964b43e2b5f72fba211042d0d3f2a0bb1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 2 Aug 2024 11:11:12 -0700 Subject: [PATCH 8/8] Fix dask_cudf passing pa_array --- python/dask_cudf/dask_cudf/backends.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index adf4dd411e8..2b1f745fc04 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -102,6 +102,7 @@ def _nest_list_data(data, leaf_type): @_dask_cudf_performance_tracking def _get_non_empty_data(s): + """Return a non empty column as metadata.""" if isinstance(s, cudf.core.column.CategoricalColumn): categories = ( s.categories if len(s.categories) else [UNKNOWN_CATEGORIES] @@ -128,7 +129,7 @@ def _get_non_empty_data(s): data = [{key: None for key in struct_dtype.fields.keys()}] * 2 data = cudf.core.column.as_column(data, dtype=s.dtype) elif is_string_dtype(s.dtype): - data = pa.array(["cat", "dog"]) + data = cudf.core.column.as_column(pa.array(["cat", "dog"])) elif isinstance(s.dtype, pd.DatetimeTZDtype): from cudf.utils.dtypes import get_time_unit