Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Additional temporal arithmetic #696

Merged
merged 28 commits into from
Aug 29, 2023
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
d85c4ff
consistent newlines above return
billylanchantin Aug 25, 2023
51f9fd0
Merge branch 'main' into date-diff-and-cleanup
billylanchantin Aug 25, 2023
4b1c6fc
first pass at redoing arithmetic operations
billylanchantin Aug 26, 2023
3c5b71b
minor error message change
billylanchantin Aug 26, 2023
f4800dc
add types
billylanchantin Aug 26, 2023
849d3ad
add/2 related tests
billylanchantin Aug 26, 2023
2c94cc1
fix default precision for date - date
billylanchantin Aug 26, 2023
95908af
subtract/2 tests
billylanchantin Aug 26, 2023
e26d0ae
more coverage for duration-only arithmetic
billylanchantin Aug 26, 2023
a452658
allow multiplying durations and floats
billylanchantin Aug 26, 2023
15b65c8
multiplication tests
billylanchantin Aug 26, 2023
aeafe8f
support divide by float
billylanchantin Aug 26, 2023
be71f9a
need to cast on divide too
billylanchantin Aug 26, 2023
cbc69e2
use consistent comment
billylanchantin Aug 26, 2023
12b50ad
divide tests
billylanchantin Aug 26, 2023
3445824
reuse error format
billylanchantin Aug 26, 2023
a843db9
remove unused module attribute
billylanchantin Aug 26, 2023
d97a44a
fix dialyzer warning
billylanchantin Aug 26, 2023
8672d16
centralize dtype logic
billylanchantin Aug 26, 2023
0861619
Merge branch 'main' into date-diff-and-cleanup
billylanchantin Aug 26, 2023
093e9ce
move arithmetic casting logic to shared
billylanchantin Aug 27, 2023
d940a3d
move argument swap for add/2 into rust
billylanchantin Aug 27, 2023
2dbf652
swap in Elixir instead
billylanchantin Aug 27, 2023
75d8a78
move multiply & divide workarounds to polars backend
billylanchantin Aug 27, 2023
25d0610
move defp contents into def
billylanchantin Aug 27, 2023
66a317f
failed attempt at arg-swapping expressions
billylanchantin Aug 27, 2023
50d4e3a
allow duration scalars in expressions
billylanchantin Aug 29, 2023
b751989
remove expression fix attempt; keep but skip tests
billylanchantin Aug 29, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 15 additions & 69 deletions lib/explorer/backend/lazy_series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,8 @@ defmodule Explorer.Backend.LazySeries do

@comparison_operations [:equal, :not_equal, :greater, :greater_equal, :less, :less_equal]

@arithmetic_operations [:pow, :quotient, :remainder]
@basic_arithmetic_operations [:add, :subtract, :multiply, :divide]
@other_arithmetic_operations [:pow, :quotient, :remainder]

@aggregation_operations [
:sum,
Expand Down Expand Up @@ -192,38 +193,6 @@ defmodule Explorer.Backend.LazySeries do
Backend.Series.new(data, dtype)
end

@impl true
def add(left, right) do
  # Build the lazy `:add` node from both operands' inner data, then wrap it
  # in a backend series carrying the dtype resolved for this operation.
  operands = [data!(left), data!(right)]
  lazy = new(:add, operands, aggregations?(operands))

  Backend.Series.new(lazy, resolve_numeric_temporal_dtype(:add, left, right))
end

@impl true
def subtract(left, right) do
  # Build the lazy `:subtract` node from both operands' inner data, then wrap
  # it in a backend series carrying the dtype resolved for this operation.
  operands = [data!(left), data!(right)]
  lazy = new(:subtract, operands, aggregations?(operands))

  Backend.Series.new(lazy, resolve_numeric_temporal_dtype(:subtract, left, right))
end

@impl true
def multiply(left, right) do
  # Build the lazy `:multiply` node from both operands' inner data, then wrap
  # it in a backend series carrying the dtype resolved for this operation.
  operands = [data!(left), data!(right)]
  lazy = new(:multiply, operands, aggregations?(operands))

  Backend.Series.new(lazy, resolve_numeric_temporal_dtype(:multiply, left, right))
end

@impl true
def divide(left, right) do
  # Build the lazy `:divide` node from both operands' inner data, then wrap
  # it in a backend series carrying the dtype resolved for this operation.
  operands = [data!(left), data!(right)]
  lazy = new(:divide, operands, aggregations?(operands))

  Backend.Series.new(lazy, resolve_numeric_temporal_dtype(:divide, left, right))
end

@impl true
def from_list(list, dtype) when is_list(list) and dtype in @valid_dtypes do
data = new(:from_list, [list, dtype], false)
Expand Down Expand Up @@ -412,7 +381,19 @@ defmodule Explorer.Backend.LazySeries do
end
end

for op <- @arithmetic_operations do
# Generates one callback per operation listed in @basic_arithmetic_operations.
# Each callback takes two `%Series{}` structs, asks
# `Explorer.Shared.cast_to_arithmetic/3` for the output dtype, and returns a
# backend series wrapping the lazy node for that operation.
for arithmetic_op <- @basic_arithmetic_operations do
  @impl true
  def unquote(arithmetic_op)(%Series{} = left, %Series{} = right) do
    # The dtype is resolved before the operands are unwrapped, so dtype
    # resolution runs ahead of `data!/1`.
    out_dtype =
      Explorer.Shared.cast_to_arithmetic(unquote(arithmetic_op), dtype(left), dtype(right))

    operands = [data!(left), data!(right)]
    lazy = new(unquote(arithmetic_op), operands, aggregations?(operands))

    Backend.Series.new(lazy, out_dtype)
  end
end

for op <- @other_arithmetic_operations do
@impl true
def unquote(op)(left, right) do
dtype = resolve_numeric_dtype([left, right])
Expand Down Expand Up @@ -654,41 +635,6 @@ defmodule Explorer.Backend.LazySeries do
defp resolve_numeric_dtype(:window_mean, _items), do: :float
defp resolve_numeric_dtype(_op, items), do: resolve_numeric_dtype(items)

# Resolves the output dtype of a binary arithmetic operation.
#
# When both operands are `%Series{}` structs, the temporal combinations are
# looked up first; anything not covered there falls back to
# `resolve_numeric_dtype/1`.
defp resolve_numeric_temporal_dtype(
       op,
       %Series{dtype: left_dtype} = left,
       %Series{dtype: right_dtype} = right
     ) do
  temporal_result_dtype(op, left_dtype, right_dtype) || resolve_numeric_dtype([left, right])
end

# At least one operand is not a `%Series{}`: division is always `:float`,
# every other operation uses the plain numeric resolution.
defp resolve_numeric_temporal_dtype(op, left, right) do
  if op == :divide do
    :float
  else
    resolve_numeric_dtype([left, right])
  end
end

# Lookup table for dtype pairs with a dedicated rule. Returns `nil` when the
# combination has no dedicated rule, so the caller can fall back.
defp temporal_result_dtype(:add, {:datetime, ltu}, {:duration, rtu}),
  do: {:datetime, highest_precision(ltu, rtu)}

defp temporal_result_dtype(:add, {:duration, ltu}, {:datetime, rtu}),
  do: {:datetime, highest_precision(ltu, rtu)}

defp temporal_result_dtype(:add, {:duration, ltu}, {:duration, rtu}),
  do: {:duration, highest_precision(ltu, rtu)}

defp temporal_result_dtype(:subtract, {:datetime, ltu}, {:datetime, rtu}),
  do: {:duration, highest_precision(ltu, rtu)}

defp temporal_result_dtype(:subtract, {:datetime, ltu}, {:duration, rtu}),
  do: {:datetime, highest_precision(ltu, rtu)}

defp temporal_result_dtype(:subtract, {:duration, ltu}, {:duration, rtu}),
  do: {:duration, highest_precision(ltu, rtu)}

defp temporal_result_dtype(:multiply, :integer, {:duration, tu}), do: {:duration, tu}
defp temporal_result_dtype(:multiply, {:duration, tu}, :integer), do: {:duration, tu}
defp temporal_result_dtype(:divide, {:duration, tu}, :integer), do: {:duration, tu}

# Must stay ahead of the generic `:divide` clause below.
defp temporal_result_dtype(:divide, _, {:duration, _}), do: raise("cannot divide by duration")
defp temporal_result_dtype(:divide, _, _), do: :float
defp temporal_result_dtype(_op, _left_dtype, _right_dtype), do: nil

# Picks the finer of two time units so that combining values never drops
# information. Identical units are returned unchanged; otherwise
# `:nanosecond` beats `:microsecond`, which beats anything else.
defp highest_precision(left_timeunit, right_timeunit) do
  case {left_timeunit, right_timeunit} do
    {same, same} ->
      same

    {l, r} when l == :nanosecond or r == :nanosecond ->
      :nanosecond

    {l, r} when l == :microsecond or r == :microsecond ->
      :microsecond
  end
end

# Returns the inner `data` if it's a lazy series. Otherwise raises an error.
defp lazy_series!(series) do
case series do
Expand Down
3 changes: 3 additions & 0 deletions lib/explorer/duration.ex
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ defmodule Explorer.Duration do
@enforce_keys [:value, :precision]
defstruct [:value, :precision]

@type precision :: :millisecond | :microsecond | :nanosecond
@type t :: %__MODULE__{value: integer(), precision: precision()}

# Nanosecond constants
@us_ns 1_000
@ms_ns 1_000 * @us_ns
Expand Down
40 changes: 40 additions & 0 deletions lib/explorer/polars_backend/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -626,6 +626,7 @@ defmodule Explorer.PolarsBackend.DataFrame do
exprs =
for {name, lazy_series} <- column_pairs do
lazy_series
|> maybe_swap_args(df)
|> to_expr()
|> alias_expr(name)
end
Expand Down Expand Up @@ -821,4 +822,43 @@ defmodule Explorer.PolarsBackend.DataFrame do
def inspect(df, opts) do
  # Delegate to the shared data frame inspector, passing the "Polars" label
  # and this frame's row count.
  row_count = n_rows(df)

  Explorer.Backend.DataFrame.inspect(df, "Polars", row_count, opts)
end

# Helpers

defp maybe_swap_args(%Explorer.Backend.LazySeries{op: op, args: args}, df) do
maybe_swapped_args =
case op do
:add ->
case List.pop_at(args, -1) do
{%Explorer.PolarsBackend.Series{} = last, rest} ->
case Explorer.PolarsBackend.Native.s_dtype(last) do
{:ok, "date"} ->
[last | rest]

_ ->
args
end

{%Explorer.Backend.LazySeries{op: :column, args: [col_name]} = last, rest} ->
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps the best place to swap args is in expression.ex. In there we convert all lazy frame operations into Polars expressions, so you can override add, multiply, and friends in there to both swap and post-cast.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried there initially, but lazy series didn't seem to contain the dtype information I needed to make decisions about whether to swap, cast, etc.

E.g. if we add this custom to_expr clause for add:

  def to_expr(%LazySeries{op: :add, args: [left, right]}) do
    left |> IO.inspect(label: :left, width: 120)
    right |> IO.inspect(label: :right, width: 120)

    Native.expr_add(to_expr(left), to_expr(right))
  end

We see the following after running df1 = DF.mutate(df, add1: sub + aug_20):

left: %Explorer.Backend.LazySeries{op: :column, args: ["aug_20"], aggregation: false}
right: %Explorer.Backend.LazySeries{op: :column, args: ["sub"], aggregation: false}

I don't know the dtypes of those args, so I don't know if I should swap or not.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GAH. Does it work in expression.rs? Can we fetch the dtype there and swap there?

If not, we would need to start storing the dtype then... it is more work though, so it probably makes sense to break this apart, add the dtype field to lazy series, and then come back to this. If we indeed can't do it in Rust, then we should merge a subset of this PR that doesn't require the arg swapping, and work on the other fix later.

I guess there is always the option of fixing Polars too.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does it work in expression.rs? Can we fetch the dtype there and swap there?

I don't think so? I'll try to check more thoroughly later today tho. Polars expressions weren't super well documented so I'll have to read some source code.

If we indeed can't do it in Rust, then we should merge a subset of this PR that doesn't require the arg swapping, and work on the other fix later.

Do you mean supporting Explorer.Series.add(duration, date) and letting a lazy expression possibly panic? Or dropping duration + date support altogether until we can address the issue?

I guess there is always the option of fixing Polars too.

You read my mind...

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean supporting Explorer.Series.add(duration, date) and letting a lazy expression possibly panic?

I meant adding a field called dtype to Explorer.Backend.LazySeries. If we do this though, I would make it so we change all Explorer.Backend.Series callbacks to receive the output dtype as an argument (or an out_series, which has the correct result type, like we do for dataframes).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry I meant what should we do for this PR? Definitely coming back later and adding the dtype info we need to make decisions sounds good.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For this PR, let's only support the dtype pairs we can implement today (without changing polars or without changing lazy series). Or do you mean we cannot support any of them? :D

Copy link
Member

@josevalim josevalim Aug 28, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, let's keep the arg swap in Explorer.Series for now. It is not ideal but it is semantically fine. We can add a TODO to them.

For the integer vs duration, which requires the post cast, we can pre-cast the integer to duration, right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, let's keep the arg swap in Explorer.Series for now. It is not ideal but it is semantically fine. We can add a TODO to them.

👍

For the integer vs duration, which requires the post cast, we can pre-cast the integer to duration, right?

No, duration * duration isn't supported. I think the only way to do it is post-cast.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I honestly think opening up a PR in Polars might be the least painful way to accomplish this 😅

case df[col_name] do
%Explorer.Series{dtype: :date} ->
[last | rest]

_ ->
args
end

_ ->
args
end

_ ->
args
end

args = Enum.map(maybe_swapped_args, &maybe_swap_args(&1, df))
%Explorer.Backend.LazySeries{op: op, args: args}
end

defp maybe_swap_args(arg, _df), do: arg
end
47 changes: 41 additions & 6 deletions lib/explorer/polars_backend/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -279,20 +279,55 @@ defmodule Explorer.PolarsBackend.Series do
# Arithmetic

@impl true
def add(left, right) do
  # The series returned by `matching_size!/2` is the one `:s_add` is applied
  # to; the right operand is passed along as its raw data.
  left
  |> matching_size!(right)
  |> Shared.apply_series(:s_add, [right.data])
end
def add(left, right) do
  left = matching_size!(left, right)

  # Polars does not support `duration + date`, but it does support
  # `date + duration`. For that one dtype pair the operands are flipped as a
  # workaround; every other pair keeps its original order.
  {receiver, operand} =
    case {dtype(left), dtype(right)} do
      {{:duration, _}, :date} -> {right, left}
      _other -> {left, right}
    end

  Shared.apply_series(receiver, :s_add, [operand.data])
end

@impl true
def subtract(left, right) do
  # The series returned by `matching_size!/2` is the one `:s_subtract` is
  # applied to; the right operand is passed along as its raw data.
  left
  |> matching_size!(right)
  |> Shared.apply_series(:s_subtract, [right.data])
end

@impl true
def multiply(left, right) do
  # The series returned by `matching_size!/2` is the one `:s_multiply` is
  # applied to; the right operand is passed along as its raw data.
  left
  |> matching_size!(right)
  |> Shared.apply_series(:s_multiply, [right.data])
end
def multiply(left, right) do
  product =
    left
    |> matching_size!(right)
    |> Shared.apply_series(:s_multiply, [right.data])

  target_dtype = Explorer.Shared.cast_to_arithmetic(:multiply, dtype(left), dtype(right))

  # Polars is currently inconsistent about the dtype it returns, e.g.:
  #   * `integer * duration -> duration` when `integer` is a scalar
  #   * `integer * duration -> integer` when `integer` is a series
  # A duration is expected in both cases, so the result is cast whenever the
  # expected dtype is a duration and Polars returned something else.
  needs_cast? = match?({:duration, _}, target_dtype) and target_dtype != dtype(product)

  if needs_cast?, do: cast(product, target_dtype), else: product
end

@impl true
def divide(left, right) do
  # The series returned by `matching_size!/2` is the one `:s_divide` is
  # applied to; the right operand is passed along as its raw data.
  left
  |> matching_size!(right)
  |> Shared.apply_series(:s_divide, [right.data])
end
def divide(left, right) do
  quotient =
    left
    |> matching_size!(right)
    |> Shared.apply_series(:s_divide, [right.data])

  target_dtype = Explorer.Shared.cast_to_arithmetic(:divide, dtype(left), dtype(right))

  # Polars is currently inconsistent about the dtype it returns, e.g.:
  #   * `duration / integer -> duration` when `integer` is a scalar
  #   * `duration / integer -> integer` when `integer` is a series
  # A duration is expected in both cases, so the result is cast whenever the
  # expected dtype is a duration and Polars returned something else.
  needs_cast? = match?({:duration, _}, target_dtype) and target_dtype != dtype(quotient)

  if needs_cast?, do: cast(quotient, target_dtype), else: quotient
end

@impl true
def quotient(left, right),
Expand Down
Loading
Loading