Skip to content

Commit

Permalink
Merge pull request #53 from martindurant/dt
Browse files Browse the repository at this point in the history
Add all unary dt methods
  • Loading branch information
martindurant authored May 8, 2024
2 parents f9641b5 + a9fc3af commit d3ba4c4
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 185 deletions.
239 changes: 56 additions & 183 deletions src/awkward_pandas/datetimes.py
Original file line number Diff line number Diff line change
@@ -1,206 +1,79 @@
from __future__ import annotations

import functools
import inspect

import awkward as ak
import pyarrow as pa
import pyarrow.compute as pc


def _run_unary(layout, op, kind=None, **kw):
    """Apply ``op`` to a single layout node, if it is one the op should act on.

    Used as the per-node callback of ``ak.transform`` (see ``run_unary``):
    a replacement is returned for nodes that are handled here and ``None``
    for every other node. Nothing is assigned onto ``layout`` itself.

    Parameters
    ----------
    layout
        The awkward layout node currently being visited.
    op
        Callable handed to ``ak.str._apply_through_arrow``.
    kind
        Optional dtype kind code; when given, only leaves whose
        ``dtype.kind`` equals it are processed.
    **kw
        Extra keyword arguments forwarded along with ``op``.

    Returns
    -------
    The result of ``ak.str._apply_through_arrow`` for matching leaves and
    for string/bytestring lists; ``None`` otherwise.
    """
    if layout.is_leaf and (kind is None or layout.dtype.kind == kind):
        return ak.str._apply_through_arrow(op, layout, **kw)
    # String and bytestring data are list layouts tagged via ``__array__``;
    # hand the whole node over instead of treating it as a generic list.
    if layout.is_list and layout.parameter("__array__") in ["bytestring", "string"]:
        return ak.str._apply_through_arrow(op, layout, **kw)
    return None


def run_unary(arr: ak.Array, op, kind=None, **kw) -> ak.Array:
    """Apply the unary operation ``op`` throughout ``arr``.

    Parameters
    ----------
    arr
        Array to process.
    op
        Callable applied (through arrow) to every node ``_run_unary`` accepts.
    kind
        Optional dtype kind code restricting which leaves are processed.
    **kw
        Extra keyword arguments forwarded to ``op``.

    Returns
    -------
    ak.Array
        The array produced by ``ak.transform``.
    """

    def func(x, **kwargs):
        # Keywords supplied by ak.transform are accepted but not used; only
        # the options captured from the enclosing call are forwarded.
        return _run_unary(x, op, kind=kind, **kw)

    return ak.transform(func, arr)

def dec(func, mode="unary"):
    """Wrap a compute function as a method for an accessor class.

    Parameters
    ----------
    func
        Callable whose first parameter is the data to operate on; its
        remaining parameters are treated as options.
    mode
        Kind of operation. Only ``"unary"`` is implemented.

    Returns
    -------
    A method ``f(self, *args, **kwargs)`` that runs ``func`` over
    ``self.accessor.array`` with ``run_unary`` and passes the result to
    ``self.accessor.to_output``. ``functools.wraps`` copies the name and
    docstring of ``func`` onto it.

    Raises
    ------
    NotImplementedError
        If ``mode`` is anything other than ``"unary"``.
    """
    # TODO: require kind= on functions that need timestamps
    if mode != "unary":
        raise NotImplementedError

    # TODO: modify __doc__?
    @functools.wraps(func)
    def f(self, *args, **kwargs):
        if args:
            # Positional arguments are paired, in order, with the names of
            # func's parameters after the first one (the data argument) and
            # then forwarded as keywords.
            sig = list(inspect.signature(func).parameters)[1:]
            kwargs.update(zip(sig, args))

        return self.accessor.to_output(
            run_unary(self.accessor.array, func, **kwargs)
        )

    return f
class DatetimeAccessor:
    """Datetime operations made available through an accessor object.

    Every unary attribute below is the ``pyarrow.compute`` function of the
    same name wrapped with ``dec``.
    """

    def __init__(self, accessor) -> None:
        self.accessor = accessor

    cast = dec(pc.cast)
    ceil_temporal = dec(pc.ceil_temporal)
    floor_temporal = dec(pc.floor_temporal)
    round_temporal = dec(pc.round_temporal)
    # Misspelled name kept as an alias so existing callers keep working.
    reound_temporal = round_temporal
    strftime = dec(pc.strftime)
    strptime = dec(pc.strptime)
    day = dec(pc.day)
    day_of_week = dec(pc.day_of_week)
    day_of_year = dec(pc.day_of_year)
    hour = dec(pc.hour)
    iso_week = dec(pc.iso_week)
    iso_year = dec(pc.iso_year)
    iso_calendar = dec(pc.iso_calendar)
    is_leap_year = dec(pc.is_leap_year)
    microsecond = dec(pc.microsecond)
    millisecond = dec(pc.millisecond)
    minute = dec(pc.minute)
    month = dec(pc.month)
    nanosecond = dec(pc.nanosecond)
    quarter = dec(pc.quarter)
    second = dec(pc.second)
    subsecond = dec(pc.subsecond)
    us_week = dec(pc.us_week)
    us_year = dec(pc.us_year)
    week = dec(pc.week)
    year = dec(pc.year)
    year_month_day = dec(pc.year_month_day)

    # the rest are binary
    def day_time_interval_between(self, end):
        raise NotImplementedError("TODO")

Expand Down
13 changes: 11 additions & 2 deletions src/awkward_pandas/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,13 @@ def read_parquet(
extract: bool = True,
**kwargs,
):
"""Read a Parquet dataset with nested data into a Series or DataFrame.
"""Read a Parquet dataset with nested data into a pandas Series or DataFrame.
This may cope with some deeply nested structures that pandas refuses
to read by itself.
You can pass a selection of columns to read (list of strings), and
other columns will not be parsed into memory.
Parameters
----------
Expand All @@ -34,7 +40,10 @@ def read_json(
extract=True,
**kwargs,
):
"""Read a JSON dataset with nested data into a Series or DataFrame.
"""Read a JSON dataset with nested data into a pandas Series or DataFrame.
You can pass a selection of columns to read (list or jsonschema format), and
other columns will not be parsed into memory.
Parameters
----------
Expand Down
14 changes: 14 additions & 0 deletions tests/test_dt.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,17 @@ def test_cast():
[datetime.datetime(1970, 1, 1, 0, 0, 1), datetime.datetime(1970, 1, 1, 0, 0)],
[datetime.datetime(1970, 1, 1, 0, 0, 2)],
]


def test_unary_unit():
    """Seconds extracted from second-resolution timestamps equal the source integers."""
    original = pd.Series([[0, 1], [1, 0], [2]])
    as_timestamps = original.ak.dt.cast("timestamp[s]")
    seconds = as_timestamps.ak.dt.second()
    assert original.to_list() == seconds.to_list()


def test_bad_type():
    """Calling ``second`` on plain integer data raises ``NotImplementedError``."""
    # consider more specific exception rather than hitting arrow's one
    integers = pd.Series([[0, 1], [1, 0], [2]])
    with pytest.raises(NotImplementedError):
        integers.ak.dt.second()

0 comments on commit d3ba4c4

Please sign in to comment.