diff --git a/.gitignore b/.gitignore index c4c2742..e211849 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +# emacs .dir-locals.el # setuptools_scm diff --git a/src/awkward_pandas/__init__.py b/src/awkward_pandas/__init__.py index 137c934..8613a25 100644 --- a/src/awkward_pandas/__init__.py +++ b/src/awkward_pandas/__init__.py @@ -1,18 +1,11 @@ from __future__ import annotations -import awkward_pandas.accessor -import awkward_pandas.dask_connect # noqa -from awkward_pandas.array import AwkwardExtensionArray -from awkward_pandas.dtype import AwkwardDtype -from awkward_pandas.io import from_awkward, read_json, read_parquet -from awkward_pandas.lib import merge +import awkward_pandas.dask +import awkward_pandas.pandas # noqa +from awkward_pandas.io import read_json, read_parquet from awkward_pandas.version import version as __version__ # noqa __all__ = ( - "AwkwardDtype", - "AwkwardExtensionArray", - "from_awkward", - "merge", "read_parquet", "read_json", ) diff --git a/src/awkward_pandas/accessor.py b/src/awkward_pandas/accessor.py deleted file mode 100644 index 2b068ea..0000000 --- a/src/awkward_pandas/accessor.py +++ /dev/null @@ -1,191 +0,0 @@ -from __future__ import annotations - -import functools -import inspect - -import awkward as ak -import pandas as pd - -from awkward_pandas.array import AwkwardExtensionArray -from awkward_pandas.dtype import AwkwardDtype -from awkward_pandas.strings import StringAccessor - -funcs = [n for n in dir(ak) if inspect.isfunction(getattr(ak, n))] - - -@pd.api.extensions.register_series_accessor("ak") -class AwkwardAccessor: - def __init__(self, pandas_obj): - if not self._validate(pandas_obj): - raise AttributeError("ak accessor called on incompatible data") - self._obj = pandas_obj - self._arr = None - - @property - def extarray(self): - if self._arr is None: - if isinstance(self._obj, AwkwardExtensionArray): - self._arr = self._obj - elif isinstance(self._obj.dtype, AwkwardDtype) and isinstance( - self._obj, pd.Series - ): - # this is a pandas Series that contains an Awkward - self._arr = self._obj.values - elif isinstance(self._obj.dtype, AwkwardDtype): - # a dask series - figure out what to do here - raise NotImplementedError - else: - # this recreates series, possibly by iteration - self._arr = AwkwardExtensionArray(self._obj) - return self._arr - - @property - def array(self) -> ak.Array: - """Get underlying awkward array""" - return self.extarray._data - - def __getitem__(self, items): - """Extract components using awkward indexing""" - ds = self.array.__getitem__(items) - index = None - if items[0]: - if not isinstance(items[0], str) and not ( - isinstance(items[0], list) and isinstance(items[0][0], str) - ): - index = self._obj.index[items[0]] - return pd.Series(AwkwardExtensionArray(ds), index=index) - - def to_column(self) -> pd.Series: - """Convert awkward series to regular pandas type - - Will convert to numpy or string[pyarrow] if appropriate. - May fail if the conversion cannot be done. - """ - data = self.array - if data.ndim > 1: - raise ValueError - # TODO: if all_strings(data) - accept ?str - if data.layout.parameter("__array__") == "string": - from pandas.core.arrays.string_arrow import ArrowStringArray - - new_ak_array = ak.to_arrow( - data, - string_to32=True, - extensionarray=False, - ) - return pd.Series(ArrowStringArray(new_ak_array)) - else: - return pd.Series(ak.to_numpy(data)) - - def to_columns( - self, - cull: bool = True, - extract_all: bool = False, - awkward_name: str = "awkward-data", - ) -> pd.DataFrame: - """Extract columns from an awkward series - - Where the series is a record type, each field may become a regular - pandas column. - - Parameters - ---------- - cull: bool - For those columns that we convert into regular ones, remove them - from the original awkward series if True - extract_all: bool - If False (default), only extract columns that can turn into normal - pandas columns. If True, all columns will be extracted, but those - that cannot be converted retain "awkward" type - awkward_name: str - If there are leftover columns in the original series, in the - resultant dataframe, these leftovers will get this column name - - Returns - ------- - pd.DataFrame - """ - s = self._obj - fields = self.array.fields - out = {} - for field in fields: - try: - out[field] = s.ak[field].ak.to_column() - except Exception: - if extract_all: - out[field] = s.ak[field] - if cull and extract_all: - pass - elif cull: - n = s.name or awkward_name - outfields = [_ for _ in fields if _ not in out] - if outfields: - out[n] = s.ak[outfields] - else: - out[s.name] = s - return pd.DataFrame(out) - - @staticmethod - def _validate(obj): - return isinstance( - obj, (AwkwardExtensionArray, ak.Array, ak.Record) - ) or isinstance(obj.values, AwkwardExtensionArray) - - # def to_arrow(self): - # return self.array.to_arrow() - - # def cartesian(self, other, **kwargs): - # if isinstance(other, AwkwardExtensionArray): - # other = other._data - # return AwkwardExtensionArray(ak.cartesian([self.array, other], **kwargs)) - - @property - def str(self) -> StringAccessor: - return StringAccessor(self) - - def __getattr__(self, item): - """Call awkward namespace function on a series""" - # replace with concrete implementations of all top-level ak functions - if item not in dir(self): - raise AttributeError - func = getattr(ak, item, None) - - if func: - - @functools.wraps(func) - def f(*others, **kwargs): - others = [ - other._data - if isinstance(getattr(other, "_data", None), ak.Array) - else other - for other in others - ] - ak_arr = func(self.array, *others, **kwargs) - # TODO: special case to carry over index and name information where output - # is similar to input, e.g., has same length - if isinstance(ak_arr, ak.Array): - # TODO: perhaps special case here if the output can be represented - # as a regular num/cupy array - return pd.Series( - AwkwardExtensionArray(ak_arr), index=self._obj.index - ) - return ak_arr - - else: - raise AttributeError - - return f - - def apply(self, fn): - """Perform function on all the values of the series""" - result = fn(self.array) - if isinstance(result, ak.Array): - return pd.Series(AwkwardExtensionArray(result)) - return result - - def __dir__(self) -> list[str]: - return [ - _ - for _ in (dir(ak)) - if not _.startswith(("_", "ak_")) and not _[0].isupper() - ] + ["to_column"] diff --git a/src/awkward_pandas/array.py b/src/awkward_pandas/array.py deleted file mode 100644 index 08008d0..0000000 --- a/src/awkward_pandas/array.py +++ /dev/null @@ -1,186 +0,0 @@ -from __future__ import annotations - -import operator -from collections.abc import Iterable -from typing import TYPE_CHECKING, Any, Literal - -import awkward as ak -import numpy as np -import pandas as pd -from pandas.core.arrays.base import ( - ExtensionArray, - ExtensionScalarOpsMixin, - set_function_name, -) -from pandas.core.dtypes.dtypes import ArrowDtype -from pandas.core.dtypes.generic import ABCDataFrame, ABCIndex, ABCSeries - -from awkward_pandas.dtype import AwkwardDtype - -if TYPE_CHECKING: - from numpy.typing import DTypeLike, NDArray - - -class AwkwardExtensionArray(ExtensionArray, ExtensionScalarOpsMixin): - _dtype: AwkwardDtype - _data: ak.Array - - def __init__(self, data: Any) -> None: - """ - - Parameters - ---------- - data : awkward array, dict, JSON string, iterable - Construct extension array from this data. If an iterable or dict, - pass to awkward to generate the internal array. If a JSON string, - parse it using awkward. - """ - self._dtype = AwkwardDtype() - if isinstance(data, type(self)): - self._data = data._data - elif isinstance(data, ak.Array): - self._data = data - elif hasattr(data, "dtype") and isinstance(data.dtype, ArrowDtype): - self._data = ak.from_arrow(data._pa_array) - elif isinstance(data, dict): - self._data = ak.Array(data) - elif isinstance(data, str): - self._data = ak.from_json(data) - elif isinstance(data, Iterable): - self._data = ak.from_iter(None if a is pd.NA else a for a in data) - elif data is None: - self._data = ak.Array([]) - else: - raise ValueError - - @classmethod - def _from_sequence(cls, scalars, *, dtype=None, copy=False): - return cls(scalars) - - @classmethod - def _empty(cls, shape, dtype): - if isinstance(shape, tuple) and len(shape) != 1: - raise ValueError - if isinstance(shape, tuple): - return cls([None] * shape[0]) - return cls([None] * shape) - - @classmethod - def _from_factorized(cls, values, original): - return cls(values) - - def __getitem__(self, item): - if isinstance(item, int): - return operator.getitem(self._data, item) - elif isinstance(item, (slice, np.ndarray, ak.Array)): - new = operator.getitem(self._data, item) - return type(self)(new) - else: - raise ValueError(f"bad item passed to getitem: {type(item)}") - - def __setitem__(self, key, value): - raise NotImplementedError - - def __len__(self) -> int: - return len(self._data) - - def __iter__(self): - for i in range(len(self)): - yield self._data[i] - - @classmethod - def _create_method(cls, op, coerce_to_dtype=True, result_dtype=None): - def _binop(self, other): - if isinstance(other, (ABCSeries, ABCIndex, ABCDataFrame)): - # rely on pandas to unbox and dispatch to us - return NotImplemented - - lvalues = self - if isinstance(other, list) or ( - isinstance(other, pd.Series) and other.dtype == "O" - ): - rvalues = cls(other) - return cls(op(lvalues._data, rvalues._data)) - else: - return cls(op(lvalues._data, other)) - - op_name = f"__{op.__name__}__" - return set_function_name(_binop, op_name, cls) - - def _reduce(self, name: str, *, skipna: bool = True, axis=None, **kwargs): - return getattr(ak, name)(self._data, **kwargs) - - def _explode(self): - nums = ak.num(self._data, axis=1) - nums_filled = ak.fill_none(nums, 0) - data = ak.where(nums_filled == 0, [[None]], self._data) - flat = ak.flatten(data) - arr = type(self)(flat) - return arr, ak.num(data, axis=1) - - @property - def dtype(self) -> AwkwardDtype: - return self._dtype - - @property - def nbytes(self) -> int: - return self._data.layout.nbytes - - def isna(self): - return np.array(ak.is_none(self._data)) - - def take(self, indices, *, allow_fill=False, fill_value=None): - return self[indices] - - def copy(self): - return type(self)(ak.copy(self._data)) - - @classmethod - def _concat_same_type(cls, to_concat): - return cls(ak.concatenate([a._data for a in to_concat])) - - @property - def ndim(self) -> Literal[1]: - return 1 - - @property - def shape(self) -> tuple[int]: - return (len(self._data),) - - def __array__(self, dtype: DTypeLike | None = None) -> NDArray: - dtype = np.dtype(object) if dtype is None else np.dtype(dtype) - - if dtype == np.dtype("O"): - return np.asarray(self._data.tolist(), dtype=dtype) - - return np.asarray(self._data, dtype=dtype) - - def __arrow_array__(self, type=None): - import pyarrow as pa - - return pa.chunked_array(ak.to_arrow(self._data), type=type) - - def tolist(self) -> list: - return self._data.tolist() - - def __array_ufunc__(self, *inputs, **kwargs): - return type(self)(self._data.__array_ufunc__(*inputs, **kwargs)) - - def max(self, **kwargs): - return ak.max(self._data, **kwargs) - - def min(self, **kwargs): - return ak.min(self._data, **kwargs) - - def mean(self, **kwargs): - return ak.mean(self._data, **kwargs) - - def std(self, **kwargs): - return ak.std(self._data, **kwargs) - - def sum(self, axis=None, **kwargs): - return ak.sum(self._data, axis=axis, **kwargs) - - -AwkwardExtensionArray._add_arithmetic_ops() -AwkwardExtensionArray._add_comparison_ops() diff --git a/src/awkward_pandas/dask.py b/src/awkward_pandas/dask.py new file mode 100644 index 0000000..23ad843 --- /dev/null +++ b/src/awkward_pandas/dask.py @@ -0,0 +1,61 @@ +import functools + +import awkward as ak +import dask.dataframe as dd +from dask.dataframe.extensions import ( + register_dataframe_accessor, + register_series_accessor, +) + +from awkward_pandas.mixin import Accessor as AkAccessor +from awkward_pandas.pandas import PandasAwkwardAccessor + + +class DaskAwkwardAccessor(AkAccessor): + series_type = dd.Series + dataframe_type = dd.DataFrame + aggregations = False # you need dask-awkward for that + + @classmethod + def _create_op(cls, op): + def run(self, *args, **kwargs): + orig = self._obj.head() + ar = (ar.head() if hasattr(ar, "ak") else ar for ar in args) + meta = PandasAwkwardAccessor._to_output(op(orig.ak.array, *ar, **kwargs)) + + def inner(data): + import awkward_pandas.pandas # noqa: F401 + + ar2 = (ar.ak.array if hasattr(ar, "ak") else ar for ar in args) + out = op(data.ak.array, *ar2, **kwargs) + return PandasAwkwardAccessor._to_output(out) + + return self._obj.map_partitions(inner, meta=meta) + + return run + + def __getattr__(self, item): + if item not in dir(self): + raise AttributeError + func = getattr(ak, item, None) + + if func: + orig = self._obj.head() + + @functools.wraps(func) + def f(*others, **kwargs): + def func2(data): + import awkward_pandas.pandas # noqa: F401 + + # data and others are pandas objects here + return getattr(data.ak, item)(*others, **kwargs) + + return self._obj.map_partitions(func2, meta=func(orig)) + + else: + raise AttributeError(item) + return f + + +register_series_accessor("ak")(DaskAwkwardAccessor) +register_dataframe_accessor("ak")(DaskAwkwardAccessor) diff --git a/src/awkward_pandas/dask_connect.py b/src/awkward_pandas/dask_connect.py deleted file mode 100644 index 9fc561d..0000000 --- a/src/awkward_pandas/dask_connect.py +++ /dev/null @@ -1,53 +0,0 @@ -import pandas as pd - -from awkward_pandas.dtype import AwkwardDtype - -s = pd.Series(["hello"], dtype="awkward") - - -def register_dtype(): - from dask.dataframe.extensions import make_array_nonempty - - data = [[1], [1, 2, None]] - - @make_array_nonempty.register(AwkwardDtype) - def _(x): - return pd.Series(data, dtype="awkward") - - -try: - from dask_expr._accessor import Accessor -except (ImportError, ModuleNotFoundError): - try: - from dask.dataframe.accessor import Accessor - except (ImportError, ModuleNotFoundError): - Accessor = object - - -class DaskAwkwardAccessor(Accessor): - _accessor_name = "ak" - - _accessor_methods = dir(s.ak) - - _accessor_properties = () - - # TODO: dask-awkward could take over here - for method in _accessor_methods: - - def _(self, *args, method=method, **kwargs): - def __(s, method=method): - return s.ak.__getattr__(method)(*args, **kwargs) - - return self._series.map_partitions(__) - - locals()[method] = _ - - -try: - import dask - from dask.dataframe.extensions import register_series_accessor - - register_dtype() - register_series_accessor("ak")(DaskAwkwardAccessor) -except (ImportError, ModuleNotFoundError): - dask = False diff --git a/src/awkward_pandas/datetimes.py b/src/awkward_pandas/datetimes.py new file mode 100644 index 0000000..9f09b2d --- /dev/null +++ b/src/awkward_pandas/datetimes.py @@ -0,0 +1,265 @@ +from __future__ import annotations + +import awkward as ak +import pyarrow as pa +import pyarrow.compute as pc + + +def _run_unary(layout, op, kind=None, **kw): + if layout.is_record: + [_run_unary(_, op, kind=kind, **kw) for _ in layout._contents] + elif layout.is_leaf and (kind is None or layout.dtype.kind == kind): + layout._data = ak.str._apply_through_arrow(op, layout, **kw).data + elif layout.is_option or layout.is_list: + _run_unary(layout.content, op, kind=kind, **kw) + + +def run_unary(arr: ak.Array, op, kind=None, **kw) -> ak.Array: + arr2 = ak.copy(arr) + _run_unary(arr2.layout, op, kind=kind, **kw) + return ak.Array(arr2) + + +class DatetimeAccessor: + def __init__(self, accessor) -> None: + self.accessor = accessor + + def cast(self, target_type=None, safe=None, options=None): + """Cast values to given type + + This may be the easiest way to make time types from scratch + + Examples + -------- + + >>> import pandas as pd + >>> import awkward_pandas.pandas + >>> s = pd.Series([[0, 1], [1, 0], [2]]) + >>> s.ak.dt.cast("timestamp[s]") + 0 ['1970-01-01T00:00:00' '1970-01-01T00:00:01'] + 1 ['1970-01-01T00:00:01' '1970-01-01T00:00:00'] + 2 ['1970-01-01T00:00:02'] + dtype: list[pyarrow] + """ + return self.accessor.to_output( + run_unary( + self.accessor.array, + pc.cast, + target_type=target_type, + safe=safe, + options=options, + ) + ) + + def ceil_temporal( + self, + /, + multiple=1, + unit="day", + *, + week_starts_monday=True, + ceil_is_strictly_greater=False, + calendar_based_origin=False, + options=None, + ): + raise NotImplementedError("TODO") + + def floor_temporal( + self, + /, + multiple=1, + unit="day", + *, + week_starts_monday=True, + ceil_is_strictly_greater=False, + calendar_based_origin=False, + options=None, + ): + raise NotImplementedError("TODO") + + def round_temporal( + self, + /, + multiple=1, + unit="day", + *, + week_starts_monday=True, + ceil_is_strictly_greater=False, + calendar_based_origin=False, + options=None, + ): + raise NotImplementedError("TODO") + + def run_end_decode(self, array): + raise NotImplementedError("TODO") + + def run_end_encode( + self, + /, + run_end_type=pa.int32(), + *, + options=None, + ): + raise NotImplementedError("TODO") + + def strftime( + self, + /, + format="%Y-%m-%dT%H:%M:%S", + locale="C", + *, + options=None, + ): + raise NotImplementedError("TODO") + + def strptime( + self, + /, + format, + unit, + error_is_null=False, + *, + options=None, + ): + raise NotImplementedError("TODO") + + def day(self): + raise NotImplementedError("TODO") + + def day_of_week( + self, + /, + *, + count_from_zero=True, + week_start=1, + options=None, + ): + raise NotImplementedError("TODO") + + def day_of_year(self): + raise NotImplementedError("TODO") + + def hour(self): + raise NotImplementedError("TODO") + + def iso_week(self): + raise NotImplementedError("TODO") + + def iso_year(self): + raise NotImplementedError("TODO") + + def iso_calendar(self): + raise NotImplementedError("TODO") + + def is_leap_year(self): + raise NotImplementedError("TODO") + + def microsecond(self): + raise NotImplementedError("TODO") + + def millisecond(self): + raise NotImplementedError("TODO") + + def minute(self): + raise NotImplementedError("TODO") + + def month(self): + raise NotImplementedError("TODO") + + def nanosecond(self): + raise NotImplementedError("TODO") + + def quarter(self): + raise NotImplementedError("TODO") + + def second(self): + raise NotImplementedError("TODO") + + def subsecond(self): + raise NotImplementedError("TODO") + + def us_week(self): + raise NotImplementedError("TODO") + + def us_year(self): + raise NotImplementedError("TODO") + + def week( + self, + /, + *, + week_starts_monday=True, + count_from_zero=False, + first_week_is_fully_in_year=False, + options=None, + ): + raise NotImplementedError("TODO") + + def year(self): + raise NotImplementedError("TODO") + + def year_month_day(self): + raise NotImplementedError("TODO") + + def day_time_interval_between(self, end): + raise NotImplementedError("TODO") + + def days_between(self, end): + raise NotImplementedError("TODO") + + def hours_between(self, end): + raise NotImplementedError("TODO") + + def microseconds_between(self, end): + raise NotImplementedError("TODO") + + def milliseconds_between(self, end): + raise NotImplementedError("TODO") + + def minutes_between(self, end): + raise NotImplementedError("TODO") + + def month_day_nano_interval_between(self, end): + raise NotImplementedError("TODO") + + def month_interval_between(self, end): + raise NotImplementedError("TODO") + + def nanoseconds_between(self, end): + return self.accessor.to_output( + pc.nanoseconds_between(self.accessor.arrow, end.ak.arrow), + ) + + def quarters_between(self, end): + raise NotImplementedError("TODO") + + def seconds_between(self, end): + return self.accessor.to_output( + pc.seconds_between(self.accessor.arrow, end.ak.arrow) + ) + + def weeks_between( + self, + end, + /, + *, + count_from_zero=True, + week_start=1, + options=None, + ): + raise NotImplementedError("TODO") + + def years_between(self, end): + return self.accessor.to_output( + pc.years_between(self.accessor.arrow, end.ak.arrow) + ) + + +def _to_arrow(array): + array = _make_unit_compatible(array) + return ak.to_arrow(array, extensionarray=False) + + +def _make_unit_compatible(array): + # TODO, actually convert units if not compatible + return array diff --git a/src/awkward_pandas/dtype.py b/src/awkward_pandas/dtype.py deleted file mode 100644 index 5b97a04..0000000 --- a/src/awkward_pandas/dtype.py +++ /dev/null @@ -1,77 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, Any - -import awkward as ak -import numpy as np -from pandas.core.dtypes.base import ExtensionDtype, register_extension_dtype - -if TYPE_CHECKING: - from awkward_pandas.array import AwkwardExtensionArray - - -@register_extension_dtype -class AwkwardDtype(ExtensionDtype): - @property - def name(self) -> str: - return "awkward" - - @property - def type(self) -> type[ak.Array]: - return ak.Array - - @property - def kind(self) -> str: - return "O" - - @property - def na_value(self) -> object: - return np.nan - - @property - def _is_numeric(self) -> bool: - return True - - @property - def _is_boolean(self) -> bool: - return True - - @classmethod - def construct_from_string(cls, string: str) -> AwkwardDtype: - """Construct an instance from a string. - - Parameters - ---------- - string : str - Should be "awkward". - - Returns - ------- - AwkwardDtype - Instance of the dtype. - - """ - - if not isinstance(string, str): - raise TypeError( - f"'construct_from_string' expects a string, got {type(string)}" - ) - - if string == cls().name: - return cls() - else: - raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") - - @classmethod - def construct_array_type(cls) -> type[AwkwardExtensionArray]: # type: ignore[valid-type] - from awkward_pandas.array import AwkwardExtensionArray - - return AwkwardExtensionArray - - def __from_arrow__(self, data: Any) -> AwkwardExtensionArray: - from awkward_pandas.array import AwkwardExtensionArray - - return AwkwardExtensionArray(ak.from_arrow(data)) - - def __repr__(self) -> str: - return "" diff --git a/src/awkward_pandas/io.py b/src/awkward_pandas/io.py index 1a4cca3..305e0da 100644 --- a/src/awkward_pandas/io.py +++ b/src/awkward_pandas/io.py @@ -1,70 +1,55 @@ from __future__ import annotations import awkward as ak -import pandas as pd +import fsspec -from awkward_pandas.array import AwkwardExtensionArray +import awkward_pandas.pandas -def from_awkward(array: ak.Array, name: str | None = None) -> pd.Series: - """Wrap an awkward Array in a pandas Series. +def read_parquet( + url: str, + storage_options: dict | None = None, + extract: bool = True, + **kwargs, +): + """Read a Parquet dataset with nested data into a Series or DataFrame. Parameters ---------- - array : ak.Array - Awkward array to wrap. - name : str, optional - Name for the series. - - Returns - ------- - pandas.Series - Resulting Series with dtype AwkwardDtype - - Examples - -------- - >>> import awkward as ak - >>> import awkward_pandas as akpd - >>> a = ak.from_iter([[1, 2, 3], [4, 5], [6]]) - >>> s = akpd.from_awkward(a, name="my-array") - 0 [1, 2, 3] - 1 [4, 5] - 2 [6] - Name: my-array, dtype: awkward - + url: data location + storage_options: any arguments for an fsspec backend + extract: whether to turn top-level records into a dataframe. If False, + will return a series. """ - return pd.Series(AwkwardExtensionArray(array), name=name) - - -def read_parquet( - url, - extract=True, - root_name="awkward", - extract_all=False, - **kwargs, -): - """Read a Parquet dataset with nested data into a Series or DataFrame.""" - ds = ak.from_parquet(url, **kwargs) - s = from_awkward(ds, name=root_name) + ds = ak.from_parquet(url, storage_options=storage_options, **kwargs) + s = awkward_pandas.pandas.PandasAwkwardAccessor._to_output(ds) if extract: - return s.ak.to_columns(cull=True, extract_all=extract_all) + return s.ak.unmerge() return s def read_json( url, + storage_options: dict | None = None, extract=True, - root_name="awkward", - extract_all=False, **kwargs, ): - """Read a JSON dataset with nested data into a Series or DataFrame.""" - ds = ak.from_json( - url, - line_delimited=True, - **kwargs, - ) - s = from_awkward(ds, name=root_name) + """Read a JSON dataset with nested data into a Series or DataFrame. + + Parameters + ---------- + url: data location + storage_options: any arguments for an fsspec backend + extract: whether to turn top-level records into a dataframe. If False, + will return a series. + """ + with fsspec.open(url, **storage_options) as f: + ds = ak.from_json( + f, + line_delimited=True, + **kwargs, + ) + s = awkward_pandas.pandas.PandasAwkwardAccessor._to_output(ds) if extract: - return s.ak.to_columns(cull=True, extract_all=extract_all) + return s.ak.unmerge() return s diff --git a/src/awkward_pandas/lib.py b/src/awkward_pandas/lib.py deleted file mode 100644 index 85ef2ae..0000000 --- a/src/awkward_pandas/lib.py +++ /dev/null @@ -1,37 +0,0 @@ -from __future__ import annotations - -import awkward as ak -import numpy as np -import pandas as pd - -from awkward_pandas.io import from_awkward - - -def merge(dataframe: pd.DataFrame, name: str | None = None) -> pd.Series: - """Create a single awkward series by merging the columns of a dataframe. - - Parameters - ---------- - dataframe: pd.DataFrame - Containing columns of simple numpy type, object type (e.g., - srtings, lists or dicts) or existing awkward columns. - name: str or None - Name of the output series. - - Returns - ------- - pd.Series - Resuling Series with dtype AwkwardDtype - - """ - out = {} - for c in dataframe.columns: - if dataframe[c].dtype == "awkward": - out[c] = dataframe[c].values._data - elif dataframe[c].dtype == "string[pyarrow]": - out[c] = ak.from_arrow(dataframe[c].values._data) - elif dataframe[c].dtype == np.dtype("O"): - out[c] = ak.from_iter(dataframe[c]) - else: - out[c] = dataframe[c].values - return from_awkward(ak.Array(out), name=name) diff --git a/src/awkward_pandas/mixin.py b/src/awkward_pandas/mixin.py index a2168e2..d84434f 100644 --- a/src/awkward_pandas/mixin.py +++ b/src/awkward_pandas/mixin.py @@ -1,4 +1,15 @@ +import functools import operator +from typing import Callable, Iterable + +import awkward as ak + +methods = [ + _ for _ in (dir(ak)) if not _.startswith(("_", "ak_")) and not _[0].isupper() +] + ["apply", "array", "explode", "dt", "str"] + +df_methods = sorted(methods + ["merge"]) +series_methods = sorted(methods + ["unmerge"]) def radd(left, right): @@ -62,56 +73,196 @@ class AbstractMethodError(NotImplementedError): class ArithmeticMixin: @classmethod - def _create_arithmetic_method(cls, op): + def _create_op(cls, op): raise AbstractMethodError(cls) @classmethod - def _create_comparison_method(cls, op): + def _create_op(cls, op): raise AbstractMethodError(cls) @classmethod - def _create_logical_method(cls, op): + def _create_op(cls, op): raise AbstractMethodError(cls) @classmethod def _add_arithmetic_ops(cls) -> None: - setattr(cls, "__add__", cls._create_arithmetic_method(operator.add)) - setattr(cls, "__radd__", cls._create_arithmetic_method(radd)) - setattr(cls, "__sub__", cls._create_arithmetic_method(operator.sub)) - setattr(cls, "__rsub__", cls._create_arithmetic_method(rsub)) - setattr(cls, "__mul__", cls._create_arithmetic_method(operator.mul)) - setattr(cls, "__rmul__", cls._create_arithmetic_method(rmul)) - setattr(cls, "__pow__", cls._create_arithmetic_method(operator.pow)) - setattr(cls, "__rpow__", cls._create_arithmetic_method(rpow)) - setattr(cls, "__mod__", cls._create_arithmetic_method(operator.mod)) - setattr(cls, "__rmod__", cls._create_arithmetic_method(rmod)) - setattr(cls, "__floordiv__", cls._create_arithmetic_method(operator.floordiv)) - setattr(cls, "__rfloordiv__", cls._create_arithmetic_method(rfloordiv)) - setattr(cls, "__truediv__", cls._create_arithmetic_method(operator.truediv)) - setattr(cls, "__rtruediv__", cls._create_arithmetic_method(rtruediv)) - setattr(cls, "__divmod__", cls._create_arithmetic_method(divmod)) - setattr(cls, "__rdivmod__", cls._create_arithmetic_method(rdivmod)) + setattr(cls, "__add__", cls._create_op(operator.add)) + setattr(cls, "__radd__", cls._create_op(radd)) + setattr(cls, "__sub__", cls._create_op(operator.sub)) + setattr(cls, "__rsub__", cls._create_op(rsub)) + setattr(cls, "__mul__", cls._create_op(operator.mul)) + setattr(cls, "__rmul__", cls._create_op(rmul)) + setattr(cls, "__pow__", cls._create_op(operator.pow)) + setattr(cls, "__rpow__", cls._create_op(rpow)) + setattr(cls, "__mod__", cls._create_op(operator.mod)) + setattr(cls, "__rmod__", cls._create_op(rmod)) + setattr(cls, "__floordiv__", cls._create_op(operator.floordiv)) + setattr(cls, "__rfloordiv__", cls._create_op(rfloordiv)) + setattr(cls, "__truediv__", cls._create_op(operator.truediv)) + setattr(cls, "__rtruediv__", cls._create_op(rtruediv)) + setattr(cls, "__divmod__", cls._create_op(divmod)) + setattr(cls, "__rdivmod__", cls._create_op(rdivmod)) @classmethod def _add_comparison_ops(cls) -> None: - setattr(cls, "__eq__", cls._create_comparison_method(operator.eq)) - setattr(cls, "__ne__", cls._create_comparison_method(operator.ne)) - setattr(cls, "__lt__", cls._create_comparison_method(operator.lt)) - setattr(cls, "__gt__", cls._create_comparison_method(operator.gt)) - setattr(cls, "__le__", cls._create_comparison_method(operator.le)) - setattr(cls, "__ge__", cls._create_comparison_method(operator.ge)) + setattr(cls, "__eq__", cls._create_op(operator.eq)) + setattr(cls, "__ne__", cls._create_op(operator.ne)) + setattr(cls, "__lt__", cls._create_op(operator.lt)) + setattr(cls, "__gt__", cls._create_op(operator.gt)) + setattr(cls, "__le__", cls._create_op(operator.le)) + setattr(cls, "__ge__", cls._create_op(operator.ge)) @classmethod def _add_logical_ops(cls) -> None: - setattr(cls, "__and__", cls._create_logical_method(operator.and_)) - setattr(cls, "__rand__", cls._create_logical_method(rand_)) - setattr(cls, "__or__", cls._create_logical_method(operator.or_)) - setattr(cls, "__ror__", cls._create_logical_method(ror_)) - setattr(cls, "__xor__", cls._create_logical_method(operator.xor)) - setattr(cls, "__rxor__", cls._create_logical_method(rxor)) + setattr(cls, "__and__", cls._create_op(operator.and_)) + setattr(cls, "__rand__", cls._create_op(rand_)) + setattr(cls, "__or__", cls._create_op(operator.or_)) + setattr(cls, "__ror__", cls._create_op(ror_)) + setattr(cls, "__xor__", cls._create_op(operator.xor)) + setattr(cls, "__rxor__", cls._create_op(rxor)) @classmethod def _add_all(cls): cls._add_logical_ops() cls._add_arithmetic_ops() cls._add_comparison_ops() + + +class Accessor(ArithmeticMixin): + """Bring the awkward API to dataframes and series""" + + aggregations = True # False means data is partitioned + series_type = () + dataframe_type = () + + def __init__(self, obj): + self._obj = obj + + @classmethod + def is_series(cls, data): + return isinstance(data, cls.series_type) + + @classmethod + def is_dataframe(cls, data): + return isinstance(data, cls.dataframe_type) + + @classmethod + def _to_output(cls, data): + raise NotImplementedError + + def to_output(self, data): + return self._to_output(data) + + def apply(self, fn: Callable): + """Perform arbitrary function on all the values of the series""" + return self.to_output(fn(self.array)) + + def __getitem__(self, item): + out = self.array.__getitem__(item) + return self.to_output(out) + + def __dir__(self) -> Iterable[str]: + return series_methods if self.is_series(self._obj) else df_methods + + def __array_function__(self, *args, **kwargs): + return self.array.__array_function__(*args, **kwargs) + + def __array_ufunc__(self, *args, **kwargs): + if args[1] == "__call__": + return args[0](self.array, *args[3:], **kwargs) + raise NotImplementedError + + @property + def arrow(self) -> ak.Array: + """Data as an arrow array""" + return self.to_arrow(self._obj) + + @classmethod + def to_arrow(cls, data): + """Data as an arrow array""" + raise NotImplementedError + + @property + def array(self) -> ak.Array: + """Data as an awkward array""" + return ak.from_arrow(self.arrow) + + @property + def str(self): + """Nested string operations""" + from awkward_pandas.strings import StringAccessor + + return StringAccessor(self) + + @property + def dt(self): + """Nested datetime operations""" + from awkward_pandas.datetimes import DatetimeAccessor + + return DatetimeAccessor(self) + + def merge(self): + """Make a single complex series out of the columns of a dataframe""" + if not self.is_dataframe(self._obj): + raise ValueError("Can only merge on a dataframe") + out = {} + for k in self._obj.columns: + # TODO: partial merge when column names are like "record.field" + out[k] = self._obj[k].ak.array + arr = ak.Array(out) + return self.to_output(arr) + + def unmerge(self): + """Make dataframe out of a series of record type""" + arr = self.array + if not arr.fields: + raise ValueError("Not array-of-records") + # TODO: partial unmerge when (some) fields are records + out = {k: self.to_output(arr[k]) for k in arr.fields} + return self.dataframe_type(out) + + @classmethod + def _create_op(cls, op): + """Make functions to perform all the arithmetic, logical and comparison ops""" + + def run(self, *args, **kwargs): + ar2 = (ar.ak.array if hasattr(ar, "ak") else ar for ar in args) + ar3 = (ar.array if isinstance(ar, cls) else ar for ar in ar2) + return self.to_output(op(self.array, *ar3, **kwargs)) + + return run + + def __getattr__(self, item): + if item not in dir(self): + raise AttributeError + func = getattr(ak, item, None) + + if func: + + @functools.wraps(func) + def f(*others, **kwargs): + others = [ + other.ak.array + if isinstance(other, (self.series_type, self.dataframe_type)) + else other + for other in others + ] + kwargs = { + k: v.ak.array + if isinstance(v, (self.series_type, self.dataframe_type)) + else v + for k, v in kwargs.items() + } + + ak_arr = func(self.array, *others, **kwargs) + if isinstance(ak_arr, ak.Array): + return self.to_output(ak_arr) + return ak_arr + + else: + raise AttributeError(item) + return f + + def __init_subclass__(cls, **kwargs): + # auto add methods to all derivative classes + cls._add_all() diff --git a/src/awkward_pandas/pandas.py b/src/awkward_pandas/pandas.py new file mode 100644 index 0000000..5965679 --- /dev/null +++ b/src/awkward_pandas/pandas.py @@ -0,0 +1,37 @@ +import awkward as ak +import pandas as pd +import pyarrow as pa + +from awkward_pandas.mixin import Accessor + + +@pd.api.extensions.register_series_accessor("ak") +@pd.api.extensions.register_dataframe_accessor("ak") +class PandasAwkwardAccessor(Accessor): + series_type = pd.Series + dataframe_type = pd.DataFrame + + @classmethod + def to_arrow(cls, data): + if cls.is_series(data): + return pa.array(data) + return pa.table(data) + + @classmethod + def _to_output(cls, data): + return pd.Series( + pd.arrays.ArrowExtensionArray(ak.to_arrow(data, extensionarray=False)) + ) + + def to_output(self, data): + # override to apply index + arr = pd.arrays.ArrowExtensionArray(ak.to_arrow(data, extensionarray=False)) + if self._obj is not None and len(arr) == len(self._obj.index): + return pd.Series(arr, index=self._obj.index) + else: + return arr + + @staticmethod + def _validate(_): + # required by pandas + return True diff --git a/src/awkward_pandas/polars.py b/src/awkward_pandas/polars.py index 9a67d3d..249a8e3 100644 --- a/src/awkward_pandas/polars.py +++ b/src/awkward_pandas/polars.py @@ -1,91 +1,19 @@ -import functools -from typing import Callable, Iterable, Union - import awkward as ak import polars as pl -from awkward_pandas.mixin import ArithmeticMixin +from awkward_pandas.mixin import Accessor @pl.api.register_series_namespace("ak") @pl.api.register_dataframe_namespace("ak") -class AwkwardOperations(ArithmeticMixin): - def __init__(self, df: pl.DataFrame): - self._df = df - - def __array_function__(self, *args, **kwargs): - return self.array.__array_function__(*args, **kwargs) - - def __array_ufunc__(self, *args, **kwargs): - if args[1] == "__call__": - return args[0](self.array, *args[3:], **kwargs) - raise NotImplementedError - - def __dir__(self) -> Iterable[str]: - return [ - _ - for _ in (dir(ak)) - if not _.startswith(("_", "ak_")) and not _[0].isupper() - ] + ["apply", "array"] - - def apply(self, fn: Callable) -> pl.DataFrame: - """Perform function on all the values of the series""" - out = fn(self.array) - return ak_to_polars(out) - - def __getitem__(self, item): - # scalars? - out = self.array.__getitem__(item) - result = ak_to_polars(out) - return result - - @property - def array(self): - return ak.from_arrow(self._df.to_arrow()) - - def __getattr__(self, item): - if item not in dir(self): - raise AttributeError - func = getattr(ak, item, None) - - if func: - - @functools.wraps(func) - def f(*others, **kwargs): - others = [ - other.ak.array - if isinstance(other, (pl.DataFrame, pl.Series)) - else other - for other in others - ] - kwargs = { - k: v.ak.array if isinstance(v, (pl.DataFrame, pl.Series)) else v - for k, v in kwargs.items() - } - - ak_arr = func(self.array, *others, **kwargs) - if isinstance(ak_arr, ak.Array): - return ak_to_polars(ak_arr) - return ak_arr - - else: - raise AttributeError(item) - return f +class PolarsAwkwardAccessor(Accessor): + series_type = pl.Series + dataframe_type = pl.DataFrame @classmethod - def _create_op(cls, op): - def run(self, *args, **kwargs): - return ak_to_polars(op(self.array, *args, **kwargs)) - - return run - - _create_arithmetic_method = _create_op - _create_comparison_method = _create_op - _create_logical_method = _create_op + def _to_output(cls, arr): + return pl.from_arrow(ak.to_arrow(arr, extensionarray=False)) - -AwkwardOperations._add_all() - - -def ak_to_polars(arr: ak.Array) -> Union[pl.DataFrame, pl.Series]: - return pl.from_arrow(ak.to_arrow(arr, extensionarray=False)) + @classmethod + def to_arrow(cls, data): + return data.to_arrow() diff --git a/src/awkward_pandas/strings.py b/src/awkward_pandas/strings.py index 918cb30..6a224c7 100644 --- a/src/awkward_pandas/strings.py +++ b/src/awkward_pandas/strings.py @@ -4,9 +4,6 @@ from collections.abc import Callable import awkward as ak -import pandas as pd - -from awkward_pandas.array import AwkwardExtensionArray def _encode(layout): @@ -60,19 +57,27 @@ def decode(arr: ak.Array, encoding: str = "utf-8") -> ak.Array: "isupper": "is_upper", "startswith": "starts_with", } +methods = [ + aname + for aname in (dir(ak.str)) + if not aname.startswith(("_", "akstr_")) and not aname[0].isupper() +] class StringAccessor: def __init__(self, accessor): self.accessor = accessor - def encode(self, encoding: str = "utf-8") -> pd.Series: - """Encode Series of strings to Series of bytes.""" - return pd.Series(AwkwardExtensionArray(encode(self.accessor.array))) + def encode(self, encoding: str = "utf-8"): + """Encode Series of strings to Series of bytes. Leaves non-strings alone.""" + return self.accessor.to_output(encode(self.accessor.array, encoding=encoding)) + + def decode(self, encoding: str = "utf-8"): + """Decode Series of bytes to Series of strings. Leaves non-bytestrings alone. - def decode(self, encoding: str = "utf-8") -> pd.Series: - """Decode Series of bytes to Series of strings.""" - return pd.Series(AwkwardExtensionArray(decode(self.accessor.array))) + Validity of UTF8 is *not* checked. + """ + return self.accessor.to_output(decode(self.accessor.array, encoding=encoding)) @staticmethod def method_name(attr: str) -> str: @@ -85,16 +90,12 @@ def __getattr__(self, attr: str) -> Callable: @functools.wraps(fn) def f(*args, **kwargs): arr = fn(self.accessor.array, *args, **kwargs) - idx = self.accessor._obj.index + # idx = self.accessor._obj.index if isinstance(arr, ak.Array): - return pd.Series(AwkwardExtensionArray(arr), index=idx) + return self.accessor.to_output(arr) return arr return f def __dir__(self) -> list[str]: - return [ - aname - for aname in (dir(ak.str)) - if not aname.startswith(("_", "akstr_")) and not aname[0].isupper() - ] + return sorted(methods) diff --git a/tests/conftest.py b/tests/conftest.py index 518b1b3..d3eb6de 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,18 +3,8 @@ import numpy as np import pandas as pd import pytest - -# from pandas.conftest import * from pandas.tests.extension.conftest import * # noqa -from awkward_pandas import AwkwardDtype - - -@pytest.fixture -def dtype(): - """Fixture overriding function in pandas/tests/extension/conftest.py""" - return AwkwardDtype() - @pytest.fixture def data(dtype): diff --git a/tests/test_accessor.py b/tests/test_accessor.py deleted file mode 100644 index 191b949..0000000 --- a/tests/test_accessor.py +++ /dev/null @@ -1,85 +0,0 @@ -import awkward as ak -import pandas as pd -import pytest - -import awkward_pandas - - -def test_len(): - s = pd.Series(awkward_pandas.AwkwardExtensionArray([[6, 2, 3], [4, 5]])) - assert s.ak.count() == 5 - s2 = s.ak.count(axis=1) - assert s2.tolist() == [3, 2] - - -def test_no_access(): - s = pd.Series([1, 2]) - with pytest.raises(Exception): - s.ak.count() - - -def test_getitem(): - s = pd.Series(awkward_pandas.AwkwardExtensionArray([[6, 2, 3], [4, 5]])) - s2 = s.ak[:, :1] - assert isinstance(s2, pd.Series) - assert s2.dtype == "awkward" - assert s2.tolist() == [[6], [4]] - - -def test_to_column_ints(): - s = pd.Series(awkward_pandas.AwkwardExtensionArray([6, 2, 3]), name="test") - s2 = s.ak.to_column() - assert s2.dtype == "int64" - - -def test_to_column_strings(): - pytest.importorskip("pyarrow") - s = pd.Series(awkward_pandas.AwkwardExtensionArray(["6", "2", "3"]), name="test") - s2 = s.ak.to_column() - assert s2.dtype == "string[pyarrow]" - - s = pd.Series(awkward_pandas.AwkwardExtensionArray([["6", "2", "3"]]), name="test") - with pytest.raises(ValueError): - s.ak.to_column() - - -def test_to_columns(): - pytest.importorskip("pyarrow") - s = pd.Series( - awkward_pandas.AwkwardExtensionArray( - {"num": [6, 2, 3], "deep": [[0], [], None], "text": ["hi", "ho", "hum"]} - ), - name="test", - ) - df = s.ak.to_columns() - assert df.columns.tolist() == ["num", "text", "test"] - assert df.num.tolist() == [6, 2, 3] - assert df.test.tolist() == [{"deep": [0]}, {"deep": []}, {"deep": None}] - assert df.text.tolist() == ["hi", "ho", "hum"] - - df = s.ak.to_columns(cull=False) - assert df.columns.tolist() == ["num", "text", "test"] - assert df.num.tolist() == [6, 2, 3] - assert df.test[0].tolist() == {"num": 6, "deep": [0], "text": "hi"} - assert df.text.tolist() == ["hi", "ho", "hum"] - - -def test_apply(): - s = pd.Series(awkward_pandas.AwkwardExtensionArray([[6, 2, 3], [4]]), name="test") - applied = s.ak.apply(lambda x: ak.num(x)) - assert applied.values._data.tolist() == ak.num(s.values._data).tolist() - - -def test_dir(): - s = pd.Series(awkward_pandas.AwkwardExtensionArray([6, 2, 3]), name="test") - assert "sum" in dir(s.ak) - assert "Array" not in dir(s.ak) - assert "ak_num" not in dir(s.ak) - assert "_util" not in dir(s.ak) - - -def test_array_property(): - a = ak.from_iter([[1, 2, 3], [4, 5], [6]]) - s = pd.Series(awkward_pandas.AwkwardExtensionArray(a)) - # ensure that the array associated with the accessor is the same as the original - assert s.ak.array is a diff --git a/tests/test_array.py b/tests/test_array.py deleted file mode 100644 index 087a184..0000000 --- a/tests/test_array.py +++ /dev/null @@ -1,59 +0,0 @@ -from __future__ import annotations - -import pandas as pd -import pytest -from pandas.testing import assert_frame_equal - -from awkward_pandas import AwkwardExtensionArray, merge - - -def test_merge_no_ak(): - pytest.importorskip("pyarrow") - df = pd.DataFrame( - { - "a": [1, 2, 3], - "b": ["hay", "ho", "hi"], - "c": pd.Series(["hay", "ho", "hi"], dtype="string[pyarrow]"), - "d": [[1, 2, 3], None, []], - } - ) - s = merge(df, name="test") - assert s.name == "test" - assert s.dtype == "awkward" - assert len(s) == 3 - arr = s.values._data - assert arr.fields == ["a", "b", "c", "d"] - assert arr["a"].tolist() == [1, 2, 3] - assert arr["b"].tolist() == ["hay", "ho", "hi"] - assert arr["c"].tolist() == ["hay", "ho", "hi"] - assert arr["d"].tolist() == [[1, 2, 3], None, []] - - -def test_merge_one_ak(): - df = pd.DataFrame({"a": [1, 2, 3]}) - df["b"] = pd.Series(AwkwardExtensionArray([[1, 2, 3], [5], [6, 7]])) - s = merge(df, name="test") - assert s.name == "test" - assert s.dtype == "awkward" - assert len(s) == 3 - arr = s.values._data - assert arr.fields == ["a", "b"] - assert arr["b"].tolist() == [[1, 2, 3], [5], [6, 7]] - - -def test_parquet_roundtrip(tmp_path): - pytest.importorskip("pyarrow") - df = pd.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "b": pd.Series(AwkwardExtensionArray([[1, 2, 3], [5], [6, 7], [], None])), - } - ) - - assert df["b"].dtype == "awkward" - - path = tmp_path / "output.parquet" - df.to_parquet(path, engine="pyarrow") - result = pd.read_parquet(path) - - assert_frame_equal(df, result) diff --git a/tests/test_base.py b/tests/test_base.py deleted file mode 100644 index 6a3240c..0000000 --- a/tests/test_base.py +++ /dev/null @@ -1,29 +0,0 @@ -from __future__ import annotations - -import awkward as ak -import pandas as pd -import pytest - -import awkward_pandas - - -def test_select(): - s = pd.Series(awkward_pandas.AwkwardExtensionArray([[6, 2, 3], [4, 5]])) - s2 = s[0] - assert isinstance(s2, ak.Array) - assert s2.tolist() == [6, 2, 3] - - s2 = s[0:1] - assert s2.dtype == "awkward" - assert isinstance(s2.values, awkward_pandas.AwkwardExtensionArray) - assert s2.tolist() == [[6, 2, 3]] - - -@pytest.mark.xfail(reason='numpy dtype("O") comparison giving issues') -def test_astype_to_ak(): - s = pd.Series([[6, 2, 3], [4, 5]], dtype=object) - s2 = s.astype("awkward") - assert s2.dtype == "awkward" - assert s2.tolist() == [[6, 2, 3], [4, 5]] - assert (s2 == s).tolist() == [[True, True, True], [True, True]] - assert (s2 == s).all() diff --git a/tests/test_dask.py b/tests/test_dask.py index ee36a96..6e87c29 100644 --- a/tests/test_dask.py +++ b/tests/test_dask.py @@ -1,5 +1,5 @@ -import awkward as ak import pandas as pd +import pyarrow as pa import pytest import awkward_pandas # noqa @@ -8,22 +8,22 @@ def test_simple_map(): - data = [[0], [0, 1]] * 2 - s = pd.Series(data, dtype="awkward") + data = pd.arrays.ArrowExtensionArray(pa.array([[0], [0, 1]] * 2)) + s = pd.Series(data) df = pd.DataFrame({"s": s}) ddf = dd.from_pandas(df, 2) - out = ddf.s.map(ak.count) + out = ddf.s.ak.count(axis=0) assert out.dtype == "int64" - assert out.compute().tolist() == [1, 2, 1, 2] + result = out.compute(scheduler="sync") + assert set(result) == {1, 2} - out = ddf + 1 - assert out.s.dtype == "awkward" - assert out.compute().s.tolist() == [[1], [1, 2]] * 2 + out = ddf.s.ak + 1 + assert out.compute(scheduler="sync").ak.to_list() == [[1], [1, 2]] * 2 def test_accessor(): - data = [[0], [0, 1]] * 2 - s = pd.Series(data, dtype="awkward") + data = pd.arrays.ArrowExtensionArray(pa.array([[0], [0, 1]] * 2)) + s = pd.Series(data) df = pd.DataFrame({"s": s}) ddf = dd.from_pandas(df, 2) out = ddf.s.ak.count() @@ -35,9 +35,10 @@ def test_accessor(): def test_distributed(): distributed = pytest.importorskip("distributed") + with distributed.Client(n_workers=1, threads_per_worker=1): - data = [[0], [0, 1]] * 2 - s = pd.Series(data, dtype="awkward") + data = pd.arrays.ArrowExtensionArray(pa.array([[0], [0, 1]] * 2)) + s = pd.Series(data) df = pd.DataFrame({"s": s}) ddf = dd.from_pandas(df, 2) out = ddf.s.ak.count() diff --git a/tests/test_dt.py b/tests/test_dt.py new file mode 100644 index 0000000..807da4d --- /dev/null +++ b/tests/test_dt.py @@ -0,0 +1,18 @@ +import datetime + +import pytest + +import awkward_pandas.pandas # noqa + +pd = pytest.importorskip("pandas") + + +def test_cast(): + s = pd.Series([[0, 1], [1, 0], [2]]) + out = s.ak.dt.cast("timestamp[s]") + assert str(out.dtype) == "list[pyarrow]" + assert out.to_list() == [ + [datetime.datetime(1970, 1, 1, 0, 0), datetime.datetime(1970, 1, 1, 0, 0, 1)], + [datetime.datetime(1970, 1, 1, 0, 0, 1), datetime.datetime(1970, 1, 1, 0, 0)], + [datetime.datetime(1970, 1, 1, 0, 0, 2)], + ] diff --git a/tests/test_pandas.py b/tests/test_pandas.py new file mode 100644 index 0000000..f2ae855 --- /dev/null +++ b/tests/test_pandas.py @@ -0,0 +1,50 @@ +import awkward as ak +import pandas as pd +import pytest + +pytest.importorskip("awkward_pandas.pandas") + + +def test_len(): + s = pd.Series([[6, 2, 3], [4, 5]]) + assert s.ak.count() == 5 + s2 = s.ak.count(axis=1) + assert s2.tolist() == [3, 2] + + +def test_getitem(): + s = pd.Series([[6, 2, 3], [4, 5]]) + s2 = s.ak[:, :1] + assert isinstance(s2, pd.Series) + assert s2.tolist() == [[6], [4]] + + +def test_apply(): + s = pd.Series([[6, 2, 3], [4]], name="test") + applied = s.ak.apply(lambda x: ak.num(x)) + assert applied.tolist() == ak.num(s.ak.array).tolist() + + +def test_dir(): + s = pd.Series([6, 2, 3], name="test") + assert "sum" in dir(s.ak) + assert "Array" not in dir(s.ak) + assert "ak_num" not in dir(s.ak) + assert "_util" not in dir(s.ak) + + +def test_array_property(): + a = [[1, 2, 3], [4, 5], [6]] + s = pd.Series(a) + # ensure that the array associated with the accessor is the same as the original + assert isinstance(s.ak.array, ak.Array) + assert a == s.ak.array.tolist() + + +def test_ufunc(): + a = [[1, 2, 3], [4, 5], [6]] + s = pd.Series(a) + assert (s.ak + 1).tolist() == [[2, 3, 4], [5, 6], [7]] + + assert (s.ak + s.ak).tolist() == [[2, 4, 6], [8, 10], [12]] + assert (s.ak + s).tolist() == [[2, 4, 6], [8, 10], [12]] diff --git a/tests/test_polars.py b/tests/test_polars.py index ff5778a..3ecf817 100644 --- a/tests/test_polars.py +++ b/tests/test_polars.py @@ -1,9 +1,8 @@ import numpy as np import pytest -import awkward_pandas.polars # noqa: F401 - pl = pytest.importorskip("polars") +pytest.importorskip("awkward_pandas.polars") def test_simple(): diff --git a/tests/test_str.py b/tests/test_str.py index 9ce6ab4..9d50f28 100644 --- a/tests/test_str.py +++ b/tests/test_str.py @@ -1,15 +1,11 @@ -from __future__ import annotations - import pandas as pd import pytest -pytest.importorskip("pyarrow") - @pytest.mark.parametrize("binary", [True, False]) @pytest.mark.parametrize("method", ["upper", "capitalize", "isalpha"]) def test_unary_methods(method, binary): - s = pd.Series(["hello world", "oi"], dtype="awkward") + s = pd.Series(["hello world", "oi"]) if binary: s = s.ak.str.encode() out = getattr(s.ak.str, method)() @@ -18,7 +14,7 @@ def test_unary_methods(method, binary): def test_with_argument(): - s = pd.Series(["hello world", "oi"], dtype="awkward") + s = pd.Series(["hello world", "oi"]) out1 = s.ak.str.starts_with("hello") out2 = s.ak.str.startswith("hello") expected = [_.startswith("hello") for _ in s.tolist()] @@ -27,7 +23,7 @@ def test_with_argument(): def test_encode_decode(): - s = pd.Series(["hello world", "oi"], dtype="awkward") + s = pd.Series(["hello world", "oi"]) s2 = s.ak.str.encode() assert s2.tolist() == [_.encode() for _ in s.tolist()] s3 = s2.ak.str.decode() @@ -35,12 +31,12 @@ def test_encode_decode(): def test_split(): - s = pd.Series(["hello world", "oio", ""], dtype="awkward") + s = pd.Series(["hello world", "oio", ""]) s2 = s.ak.str.split_whitespace() assert s2.tolist() == [["hello", "world"], ["oio"], [""]] s2 = s.ak.str.split_pattern("i") assert s2.tolist() == [["hello world"], ["o", "o"], [""]] - s = pd.Series([b"hello world", b"oio", b""], dtype="awkward") + s = pd.Series([b"hello world", b"oio", b""]) s2 = s.ak.str.split_whitespace() assert s2.tolist() == [[b"hello", b"world"], [b"oio"], [b""]] diff --git a/tests/test_upstream_extension_tests.py b/tests/test_upstream_extension_tests.py deleted file mode 100644 index 9d15b5c..0000000 --- a/tests/test_upstream_extension_tests.py +++ /dev/null @@ -1,153 +0,0 @@ -from __future__ import annotations - -import pandas as pd -import pandas._testing as tm -from pandas.tests.extension.base import BaseConstructorsTests, BaseDtypeTests -from pandas.tests.extension.base.casting import BaseCastingTests # noqa -from pandas.tests.extension.base.dim2 import Dim2CompatTests # noqa -from pandas.tests.extension.base.dim2 import NDArrayBacked2DTests # noqa -from pandas.tests.extension.base.getitem import BaseGetitemTests # noqa -from pandas.tests.extension.base.groupby import BaseGroupbyTests # noqa -from pandas.tests.extension.base.index import BaseIndexTests # noqa -from pandas.tests.extension.base.interface import BaseInterfaceTests # noqa -from pandas.tests.extension.base.io import BaseParsingTests # noqa -from pandas.tests.extension.base.methods import BaseMethodsTests # noqa -from pandas.tests.extension.base.missing import BaseMissingTests # noqa -from pandas.tests.extension.base.ops import BaseArithmeticOpsTests # noqa -from pandas.tests.extension.base.ops import BaseComparisonOpsTests # noqa -from pandas.tests.extension.base.ops import BaseOpsUtil # noqa -from pandas.tests.extension.base.ops import BaseUnaryOpsTests # noqa -from pandas.tests.extension.base.printing import BasePrintingTests # noqa -from pandas.tests.extension.base.reduce import BaseBooleanReduceTests # noqa -from pandas.tests.extension.base.reduce import BaseNoReduceTests # noqa -from pandas.tests.extension.base.reduce import BaseNumericReduceTests # noqa -from pandas.tests.extension.base.reshaping import BaseReshapingTests # noqa -from pandas.tests.extension.base.setitem import BaseSetitemTests # noqa - -import awkward_pandas - - -def test_version(): - assert awkward_pandas.__version__ - - -class TestAwkwardDtype(BaseDtypeTests): - pass - - -class TestAwkwardConstructors(BaseConstructorsTests): - def test_series_constructor_scalar_with_index(self, data, dtype): - assert True - - # Overridden because pd.DataFrame(list(AwkwardExtensionArray)) - # won't work. - def test_from_dtype(self, data): - # construct from our dtype & string dtype - dtype = data.dtype - - expected = pd.Series(data) - result = pd.Series(list(data), dtype=dtype) - tm.assert_series_equal(result, expected) - - result = pd.Series(list(data), dtype=str(dtype)) - tm.assert_series_equal(result, expected) - - # this is the test that breaks the upstream version - # expected = pd.DataFrame(data).astype(dtype) - # result = pd.DataFrame(list(data), dtype=dtype) - # tm.assert_frame_equal(result, expected) - - -# class TestAwkwardBaseCastingTests(BaseCastingTests): - -# # Overridden because list(AwkwardExtensionArray) will contain -# # ak.Array as elements, not python objects. -# def test_tolist(self, data): -# result = pd.Series(data).tolist() -# expected = data.tolist() -# assert result == expected - -# result = list(pd.Series(data)) -# expected = list(data) -# for res, exp in zip(result, expected): -# assert ak.all(res == exp) - - -# class TestAwkwardBaseGetitemTests(BaseGetitemTests): -# pass - - -# class TestAwkwardBaseGroupbyTests(BaseGroupbyTests): -# pass - - -# class TestAwkwardBaseIndexTests(BaseIndexTests): -# pass - - -# class TestAwkwardBaseInterfaceTests(BaseInterfaceTests): -# pass - - -# class TestAwkwardDim2CompatTests(Dim2CompatTests): -# pass - - -# # Not compatible with awkward array -# # class TestAwkwardNDArrayBacked2DTests(NDArrayBacked2DTests): -# # pass - - -# class TestAwkwardBaseParsingTests(BaseParsingTests): -# pass - - -# class TestAwkwardBaseMethodsTests(BaseMethodsTests): -# pass - - -# class TestAwkwardBaseMissingTests(BaseMissingTests): -# pass - - -# class TestAwkwardBaseArithmeticOpsTests(BaseArithmeticOpsTests): -# pass - - -# class TestAwkwardBaseComparisonOpsTests(BaseComparisonOpsTests): -# pass - - -# class TestAwkwardBaseOpsUtil(BaseOpsUtil): -# pass - - -# class TestAwkwardBaseUnaryOpsTests(BaseUnaryOpsTests): -# pass - - -# class TestAwkwardBasePrintingTests(BasePrintingTests): -# pass - - -# class TestAwkwardBaseBooleanReduceTests(BaseBooleanReduceTests): -# pass - - -# class TestAwkwardBaseNoReduceTests(BaseNoReduceTests): -# pass - - -# class TestAwkwardBaseNumericReduceTests(BaseNumericReduceTests): -# pass - - -# class TestAwkwardBaseReshapingTests(BaseReshapingTests): -# def test_ravel(self, data): -# result = data.ravel() -# assert type(result) == type(data) -# result._data is data._data - - -# # class TestAwkwardBaseSetitemTests(BaseSetitemTests): -# # pass