Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support more integer dtypes in Series #824

Merged
merged 22 commits into from
Jan 16, 2024
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
7968ebc
Support signed/unsigned dtypes in aggregation for series
philss Jan 5, 2024
8a937c6
WIP: need to fix casting before subtract
philss Jan 8, 2024
96eb5e0
Fix issues after rebase
philss Jan 9, 2024
c8a554e
Fix min/max, categorise and subtract from Series
philss Jan 9, 2024
55e41d9
Divide will always give a f64
philss Jan 9, 2024
58bf029
Make Series.subtract/2 work by mixing int dtypes
philss Jan 15, 2024
9bc1f43
More tests to `Series.divide/2`
philss Jan 15, 2024
1951705
Fix `Series.in/2` to support mixing integer dtypes
philss Jan 15, 2024
ad034fe
Fix Series.peaks/2 to support more numeric dtypes
philss Jan 15, 2024
750327f
Fix Series.quotient/2 to work with mixed int dtypes
philss Jan 15, 2024
cb562c7
Make `Series.rank/2` return a u32 series for ordinal ranking
philss Jan 15, 2024
9eeff03
Document about mixing series of different dtypes on select/2
philss Jan 15, 2024
138c1ec
Ensure that we are "targeting" the right dtype
philss Jan 15, 2024
f4571d8
Fix `Series.remainder/2` to work with more int dtypes
philss Jan 16, 2024
c7f7585
Fix "DF.dummies/2" to use :u8 columns instead of :s64
philss Jan 16, 2024
a2deaf2
Refactor to use "right" int types in Series' min/max/sum
philss Jan 16, 2024
76bd49f
Use u32 in Series.argsort/2
philss Jan 16, 2024
baccca3
Update lib/explorer/series.ex
philss Jan 16, 2024
566d977
Simplify calculation of out_dtype in arithmetic ops
philss Jan 16, 2024
b802654
Move cast to rust code in "Series.subtract/2"
philss Jan 16, 2024
8ffa3d2
Fix cast_to_divide
philss Jan 16, 2024
d495a53
Simplify rule for subtract unsigned integers
philss Jan 16, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 13 additions & 13 deletions lib/explorer/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -3732,9 +3732,9 @@ defmodule Explorer.DataFrame do
iex> Explorer.DataFrame.dummies(df, "col_x")
#Explorer.DataFrame<
Polars[4 x 3]
col_x_a s64 [1, 0, 1, 0]
col_x_b s64 [0, 1, 0, 0]
col_x_c s64 [0, 0, 0, 1]
col_x_a u8 [1, 0, 1, 0]
col_x_b u8 [0, 1, 0, 0]
col_x_c u8 [0, 0, 0, 1]
>

Or multiple columns:
Expand All @@ -3743,12 +3743,12 @@ defmodule Explorer.DataFrame do
iex> Explorer.DataFrame.dummies(df, ["col_x", "col_y"])
#Explorer.DataFrame<
Polars[4 x 6]
col_x_a s64 [1, 0, 1, 0]
col_x_b s64 [0, 1, 0, 0]
col_x_c s64 [0, 0, 0, 1]
col_y_b s64 [1, 0, 1, 0]
col_y_a s64 [0, 1, 0, 0]
col_y_d s64 [0, 0, 0, 1]
col_x_a u8 [1, 0, 1, 0]
col_x_b u8 [0, 1, 0, 0]
col_x_c u8 [0, 0, 0, 1]
col_y_b u8 [1, 0, 1, 0]
col_y_a u8 [0, 1, 0, 0]
col_y_d u8 [0, 0, 0, 1]
>

Or all string columns:
Expand All @@ -3757,9 +3757,9 @@ defmodule Explorer.DataFrame do
iex> Explorer.DataFrame.dummies(df, fn _name, type -> type == :string end)
#Explorer.DataFrame<
Polars[4 x 3]
col_y_b s64 [1, 0, 1, 0]
col_y_a s64 [0, 1, 0, 0]
col_y_d s64 [0, 0, 0, 1]
col_y_b u8 [1, 0, 1, 0]
col_y_a u8 [0, 1, 0, 0]
col_y_d u8 [0, 0, 0, 1]
>

Ranges, regexes, and functions are also accepted in column names, as in `select/2`.
Expand All @@ -3779,7 +3779,7 @@ defmodule Explorer.DataFrame do
value <- Series.to_list(Series.distinct(df[column])),
do: column <> "_#{value}"

out_dtypes = for new_column <- out_columns, into: %{}, do: {new_column, {:s, 64}}
out_dtypes = for new_column <- out_columns, into: %{}, do: {new_column, {:u, 8}}

out_df = %{df | groups: [], names: out_columns, dtypes: out_dtypes}
Shared.apply_impl(df, :dummies, [out_df, columns])
Expand Down
24 changes: 20 additions & 4 deletions lib/explorer/polars_backend/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -208,11 +208,11 @@ defmodule Explorer.PolarsBackend.Series do
def mode(series), do: Shared.apply_series(series, :s_mode)

@impl true
def variance(series, ddof), do: Shared.apply_series(series, :s_variance, [ddof])
def variance(series, ddof), do: series |> Shared.apply_series(:s_variance, [ddof]) |> at(0)

@impl true
def standard_deviation(series, ddof),
do: Shared.apply_series(series, :s_standard_deviation, [ddof])
do: series |> Shared.apply_series(:s_standard_deviation, [ddof]) |> at(0)
philss marked this conversation as resolved.
Show resolved Hide resolved

@impl true
def quantile(series, quantile),
Expand Down Expand Up @@ -271,8 +271,24 @@ defmodule Explorer.PolarsBackend.Series do
do: Shared.apply_series(matching_size!(left, right), :s_add, [right.data])

@impl true
def subtract(_out_dtype, left, right),
do: Shared.apply_series(matching_size!(left, right), :s_subtract, [right.data])
# Element-wise subtraction of `right` from `left`.
#
# `left` first goes through `matching_size!/2` together with `right`.
# When both sides are signed integers nothing is cast. For any other
# combination of signed, unsigned and float dtypes, `left` is cast to
# `out_dtype` before `:s_subtract` is applied. Every other dtype pair is
# forwarded untouched. `right` is never cast here.
def subtract(out_dtype, left, right) do
  minuend = matching_size!(left, right)

  minuend =
    case {minuend.dtype, right.dtype} do
      {{:s, _}, {:s, _}} ->
        minuend

      {{l_kind, _}, {r_kind, _}} when l_kind in [:s, :f, :u] and r_kind in [:s, :f, :u] ->
        cast(minuend, out_dtype)

      _other ->
        minuend
    end

  Shared.apply_series(minuend, :s_subtract, [right.data])
end

@impl true
def multiply(out_dtype, left, right) do
Expand Down
104 changes: 71 additions & 33 deletions lib/explorer/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -1234,11 +1234,11 @@ defmodule Explorer.Series do
"""
@doc type: :element_wise
def categorise(%Series{dtype: l_dtype} = series, %Series{dtype: dtype} = categories)
when K.and(K.in(l_dtype, [{:s, 64}, :string]), K.in(dtype, [:string, :category])),
when K.and(K.in(l_dtype, [:string | @integer_types]), K.in(dtype, [:string, :category])),
do: apply_series(series, :categorise, [categories])

def categorise(%Series{dtype: l_dtype} = series, [head | _] = categories)
when K.and(K.in(l_dtype, [{:s, 64}, :string]), is_binary(head)),
when K.and(K.in(l_dtype, [:string | @integer_types]), is_binary(head)),
do: apply_series(series, :categorise, [from_list(categories, dtype: :string)])

# Slice and dice
Expand Down Expand Up @@ -1337,6 +1337,10 @@ defmodule Explorer.Series do

`predicate` must be a boolean series. `on_true` and `on_false` must be
a series of the same size as `predicate` or a series of size 1.

It is possible to mix numeric series in the `on_true` and `on_false`,
and the resultant series will have the dtype of the greater side.
For example, mixing `:u8` and `:s16` results in an `:s16` series.
"""
@doc type: :element_wise
@spec select(
Expand Down Expand Up @@ -1840,7 +1844,7 @@ defmodule Explorer.Series do
iex> Explorer.Series.rank(s, method: :ordinal)
#Explorer.Series<
Polars[3]
s64 [1, 2, 3]
u32 [1, 2, 3]
>

iex> s = Explorer.Series.from_list([ ~N[2022-07-07 17:44:13.020548], ~N[2022-07-07 17:43:08.473561], ~N[2022-07-07 17:45:00.116337] ])
Expand Down Expand Up @@ -3167,10 +3171,14 @@ defmodule Explorer.Series do
end
end

# TODO: fix the logic for integer dtypes
defp cast_to_add({:s, left}, {:s, right}), do: {:s, max(left, right)}
defp cast_to_add({:s, _}, {:f, _} = float), do: float
defp cast_to_add({:f, _} = float, {:s, _}), do: float
# Output dtype of an addition, given the dtypes of both operands.
# Every clause returns a dtype, never a bare bit size.
# TODO(review): review the size needed for this operation.
#
# Integers of the same signedness keep that signedness and take the wider size.
defp cast_to_add({int_type, left}, {int_type, right}) when K.in(int_type, [:s, :u]),
  do: {int_type, max(left, right)}

# Mixed signedness yields a signed dtype: the unsigned side is given twice its
# bit size, and the result is capped at 64 bits. Both clauses bind the sizes by
# signedness, so the result does not depend on operand order.
defp cast_to_add({:s, s_size}, {:u, u_size}), do: {:s, min(64, max(s_size, 2 * u_size))}
defp cast_to_add({:u, u_size}, {:s, s_size}), do: {:s, min(64, max(s_size, 2 * u_size))}

# An integer mixed with a float takes the float dtype; two floats give `{:f, 64}`.
defp cast_to_add({int_type, _}, {:f, _} = float) when K.in(int_type, [:s, :u]), do: float
defp cast_to_add({:f, _} = float, {int_type, _}) when K.in(int_type, [:s, :u]), do: float
defp cast_to_add({:f, _}, {:f, _}), do: {:f, 64}
philss marked this conversation as resolved.
Show resolved Hide resolved
defp cast_to_add(:date, {:duration, _}), do: :date
defp cast_to_add({:duration, _}, :date), do: :date
Expand Down Expand Up @@ -3234,10 +3242,21 @@ defmodule Explorer.Series do
end
end

# TODO: fix the logic for new integer dtypes
defp cast_to_subtract({:s, left}, {:s, right}), do: {:s, max(left, right)}
defp cast_to_subtract({:s, _}, {:f, _} = float), do: float
defp cast_to_subtract({:f, _} = float, {:s, _}), do: float
# Output dtype of a subtraction, given the dtypes of both operands.
# Review the size needed for this operation.
#
# Two integers of the same signedness: the result is always signed, with twice
# the wider bit size, capped at 64.
defp cast_to_subtract({kind, l_size}, {kind, r_size}) when K.in(kind, [:s, :u]),
  do: {:s, min(64, max(l_size, r_size) * 2)}

# Signed mixed with unsigned, where the unsigned side is at least as wide:
# signed, with twice the unsigned bit size, capped at 64.
defp cast_to_subtract({:s, signed}, {:u, unsigned}) when unsigned >= signed,
  do: {:s, min(64, 2 * unsigned)}

defp cast_to_subtract({:u, unsigned}, {:s, signed}) when unsigned >= signed,
  do: {:s, min(64, 2 * unsigned)}

# Signed mixed with a narrower unsigned side: the signed dtype is kept as is.
defp cast_to_subtract({:s, signed}, {:u, _}), do: {:s, signed}
defp cast_to_subtract({:u, _}, {:s, signed}), do: {:s, signed}

# An integer mixed with a float takes the float dtype; two floats give `{:f, 64}`.
defp cast_to_subtract({kind, _}, {:f, _} = float) when K.in(kind, [:s, :u]), do: float
defp cast_to_subtract({:f, _} = float, {kind, _}) when K.in(kind, [:s, :u]), do: float
defp cast_to_subtract({:f, _}, {:f, _}), do: {:f, 64}

defp cast_to_subtract(:date, :date), do: {:duration, :millisecond}
Expand Down Expand Up @@ -3293,11 +3312,22 @@ defmodule Explorer.Series do
end
end

# TODO: fix the logic for new dtypes
defp cast_to_multiply({:s, left}, {:s, right}), do: {:s, max(left, right)}
defp cast_to_multiply({:s, _}, {:f, _} = float), do: float
defp cast_to_multiply({:f, _} = float, {:s, _}), do: float
defp cast_to_multiply({int_type, left}, {int_type, right}) when K.in(int_type, [:s, :u]),
do: {int_type, max(left, right)}

defp cast_to_multiply({:s, s_size}, {:u, u_size}) when u_size >= s_size,
do: {:s, min(64, u_size * 2)}

defp cast_to_multiply({:u, u_size}, {:s, s_size}) when u_size >= s_size,
do: {:s, min(64, u_size * 2)}
philss marked this conversation as resolved.
Show resolved Hide resolved

defp cast_to_multiply({:s, s_size}, {:u, _}), do: {:s, s_size}
defp cast_to_multiply({:u, _}, {:s, s_size}), do: {:s, s_size}

defp cast_to_multiply({int_type, _}, {:f, _} = float) when K.in(int_type, [:s, :u]), do: float
defp cast_to_multiply({:f, _} = float, {int_type, _}) when K.in(int_type, [:s, :u]), do: float
defp cast_to_multiply({:f, _}, {:f, _}), do: {:f, 64}

defp cast_to_multiply({:s, _}, {:duration, p}), do: {:duration, p}
defp cast_to_multiply({:duration, p}, {:s, _}), do: {:duration, p}
defp cast_to_multiply({:f, _}, {:duration, p}), do: {:duration, p}
Expand Down Expand Up @@ -3368,10 +3398,12 @@ defmodule Explorer.Series do
end
end

# Fix the logic for new integer dtypes
defp cast_to_divide({:s, _}, {:s, _}), do: {:f, 64}
defp cast_to_divide({:s, _}, {:f, _} = float), do: float
defp cast_to_divide({:f, _} = float, {:s, _}), do: float
# Output dtype of a division, given the dtypes of both operands.
# Review the size needed for this operation.
#
# Any two integers, signed or unsigned in any combination, divide to `{:f, 64}`.
defp cast_to_divide({l_kind, _}, {r_kind, _})
     when K.and(K.in(l_kind, [:s, :u]), K.in(r_kind, [:s, :u])),
     do: {:f, 64}

# An integer and a float divide to that float's dtype; two floats to `{:f, 64}`.
defp cast_to_divide({:f, _} = float, {kind, _}) when K.in(kind, [:s, :u]), do: float
defp cast_to_divide({kind, _}, {:f, _} = float) when K.in(kind, [:s, :u]), do: float
defp cast_to_divide({:f, _}, {:f, _}), do: {:f, 64}
defp cast_to_divide({:duration, p}, {:s, _}), do: {:duration, p}
defp cast_to_divide({:duration, p}, {:f, _}), do: {:duration, p}
Expand Down Expand Up @@ -3534,14 +3566,17 @@ defmodule Explorer.Series do
"""
@doc type: :element_wise
@spec quotient(left :: Series.t(), right :: Series.t() | integer()) :: Series.t()
def quotient(%Series{dtype: {:s, 64}} = left, %Series{dtype: {:s, 64}} = right),
do: apply_series_list(:quotient, [left, right])
def quotient(%Series{dtype: l_dtype} = left, %Series{dtype: r_dtype} = right)
when K.and(K.in(l_dtype, @integer_types), K.in(r_dtype, @integer_types)),
do: apply_series_list(:quotient, [left, right])

def quotient(%Series{dtype: {:s, 64}} = left, right) when is_integer(right),
do: apply_series_list(:quotient, [left, from_list([right])])
def quotient(%Series{dtype: l_dtype} = left, right)
when K.and(K.in(l_dtype, @integer_types), is_integer(right)),
do: apply_series_list(:quotient, [left, from_list([right])])

def quotient(left, %Series{dtype: {:s, 64}} = right) when is_integer(left),
do: apply_series_list(:quotient, [from_list([left]), right])
def quotient(left, %Series{dtype: r_dtype} = right)
when K.and(K.in(r_dtype, @integer_types), is_integer(left)),
do: apply_series_list(:quotient, [from_list([left]), right])

@doc """
Computes the remainder of an element-wise integer division.
Expand Down Expand Up @@ -3584,14 +3619,17 @@ defmodule Explorer.Series do
"""
@doc type: :element_wise
@spec remainder(left :: Series.t(), right :: Series.t() | integer()) :: Series.t()
def remainder(%Series{dtype: {:s, 64}} = left, %Series{dtype: {:s, 64}} = right),
do: apply_series_list(:remainder, [left, right])
def remainder(%Series{dtype: l_dtype} = left, %Series{dtype: r_dtype} = right)
when K.and(K.in(l_dtype, @integer_types), K.in(r_dtype, @integer_types)),
do: apply_series_list(:remainder, [left, right])

def remainder(%Series{dtype: {:s, 64}} = left, right) when is_integer(right),
do: apply_series_list(:remainder, [left, from_list([right])])
def remainder(%Series{dtype: l_dtype} = left, right)
when K.and(K.in(l_dtype, @integer_types), is_integer(right)),
do: apply_series_list(:remainder, [left, from_list([right])])

def remainder(left, %Series{dtype: {:s, 64}} = right) when is_integer(left),
do: apply_series_list(:remainder, [from_list([left]), right])
def remainder(left, %Series{dtype: r_dtype} = right)
when K.and(K.in(r_dtype, @integer_types), is_integer(left)),
do: apply_series_list(:remainder, [from_list([left]), right])

@doc """
Computes the sine of a number (in radians).
Expand Down Expand Up @@ -4401,14 +4439,14 @@ defmodule Explorer.Series do
iex> Explorer.Series.argsort(s)
#Explorer.Series<
Polars[4]
s64 [3, 1, 2, 0]
u32 [3, 1, 2, 0]
>

iex> s = Explorer.Series.from_list([9, 3, 7, 1])
iex> Explorer.Series.argsort(s, direction: :desc)
#Explorer.Series<
Polars[4]
s64 [0, 2, 1, 3]
u32 [0, 2, 1, 3]
>

"""
Expand Down
7 changes: 2 additions & 5 deletions native/explorer/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -477,11 +477,8 @@ pub fn df_to_dummies(df: ExDataFrame, selection: Vec<&str>) -> Result<ExDataFram
let dummies = df
.select(selection)
.and_then(|df| df.to_dummies(None, drop_first))?;
let series = dummies
.iter()
.map(|series| series.cast(&DataType::Int64).unwrap())
.collect();
Ok(ExDataFrame::new(DataFrame::new(series)?))

Ok(ExDataFrame::new(dummies))
}

#[rustler::nif(schedule = "DirtyCpu")]
Expand Down
Loading
Loading