Skip to content

Commit

Permalink
Merge pull request #6906 from janezd/groupby-keep-variables
Browse files Browse the repository at this point in the history
GroupBy: Avoid guessing variable types
  • Loading branch information
janezd authored Oct 12, 2024
2 parents c6a79f6 + c9edef4 commit 9497b39
Show file tree
Hide file tree
Showing 7 changed files with 254 additions and 83 deletions.
52 changes: 38 additions & 14 deletions Orange/data/aggregate.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from functools import lru_cache
from typing import Callable, Dict, List, Tuple, Union
from typing import Callable, Dict, List, Tuple, Union, Type

import pandas as pd

Expand Down Expand Up @@ -39,15 +39,20 @@ def __init__(self, table: Table, by: List[Variable]):
df = table_to_frame(table, include_metas=True)
# observed=True keeps only groups with at least one instance
self.group_by = df.groupby([a.name for a in by], observed=True)
self.by = tuple(by)

# lru_cache that caches at the object level
self.compute_aggregation = lru_cache()(self._compute_aggregation)

AggDescType = Union[str,
Callable,
Tuple[str, Union[str, Callable]],
Tuple[str, Union[str, Callable], Union[Type[Variable], bool]]
]

def aggregate(
self,
aggregations: Dict[
Variable, List[Union[str, Callable, Tuple[str, Union[str, Callable]]]]
],
aggregations: Dict[Variable, List[AggDescType]],
callback: Callable = dummy_callback,
) -> Table:
"""
Expand All @@ -57,12 +62,16 @@ def aggregate(
----------
aggregations
The dictionary that defines aggregations that need to be computed
for variables. We support two formats:
for variables. We support three formats:
- {variable name: [agg function 1, agg function 2]}
- {variable name: [(agg name 1, agg function 1), (agg name 2, agg function 2)]}
- {variable name: [(agg name 1, agg function 1, output_variable_type1), ...]}
Where agg name is the aggregation name used in the output column name.
Aggregation function can be either function or string that defines
aggregation in Pandas (e.g. mean).
output_variable_type can be a type for a new variable, True to copy
the input variable, or False to create a new variable of the same type
as the input. If it is None or omitted, the output variable type is
inferred from the aggregated data.
callback
Callback function to report the progress
Expand All @@ -75,29 +84,44 @@ def aggregate(
count = 0

result_agg = []
output_variables = []
for col, aggs in aggregations.items():
for agg in aggs:
res = self._compute_aggregation(col, agg)
res, var = self._compute_aggregation(col, agg)
result_agg.append(res)
output_variables.append(var)
count += 1
callback(count / num_aggs * 0.8)

agg_table = self._aggregations_to_table(result_agg)
agg_table = self._aggregations_to_table(result_agg, output_variables)
callback(1)
return agg_table

def _compute_aggregation(
self, col: Variable, agg: Union[str, Callable, Tuple[str, Union[str, Callable]]]
) -> pd.Series:
self, col: Variable, agg: AggDescType) -> Tuple[pd.Series, Variable]:
# use named aggregation to avoid issues with same column names when reset_index
if isinstance(agg, tuple):
name, agg = agg
name, agg, var_type, *_ = (*agg, None)
else:
name = agg if isinstance(agg, str) else agg.__name__
var_type = None
col_name = f"{col.name} - {name}"
return self.group_by[col.name].agg(**{col_name: agg})

def _aggregations_to_table(self, aggregations: List[pd.Series]) -> Table:
agg_col = self.group_by[col.name].agg(**{col_name: agg})
if var_type is True:
var = col.copy(name=col_name)
elif var_type is False:
var = col.make(name=col_name)
elif var_type is None:
var = None
else:
assert issubclass(var_type, Variable)
var = var_type.make(name=col_name)
return agg_col, var

def _aggregations_to_table(
self,
aggregations: List[pd.Series],
output_variables: List[Union[Variable, None]]) -> Table:
"""Concatenate aggregation series and convert back to Table"""
if aggregations:
df = pd.concat(aggregations, axis=1)
Expand All @@ -107,7 +131,7 @@ def _aggregations_to_table(self, aggregations: List[pd.Series]) -> Table:
df = df.drop(columns=df.columns)
gb_attributes = df.index.names
df = df.reset_index() # move group by var that are in index to columns
table = table_from_frame(df)
table = table_from_frame(df, variables=(*self.by, *output_variables))

# group by variables should be last two columns in metas in the output
metas = table.domain.metas
Expand Down
83 changes: 53 additions & 30 deletions Orange/data/pandas_compat.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Pandas DataFrame↔Table conversion helpers"""
from functools import partial
from itertools import zip_longest

import numpy as np
from scipy import sparse as sp
Expand Down Expand Up @@ -255,7 +256,14 @@ def to_categorical(s, _):
return np.asarray(x)


def vars_from_df(df, role=None, force_nominal=False):
def to_numeric(s, _):
    """Return the values of `s` as a numeric NumPy array.

    `s` is passed through `pandas.to_numeric` and the result is wrapped
    with `numpy.asarray`. The second positional argument is accepted but
    ignored (presumably so the signature matches the other two-argument
    converters such as `to_categorical` -- confirm against callers).
    """
    numeric = pd.to_numeric(s)
    return np.asarray(numeric)


def vars_from_df(df, role=None, force_nominal=False, variables=None):
if variables is not None:
assert len(variables) == len(df.columns)

if role is None and hasattr(df, 'orange_role'):
role = df.orange_role
df = _reset_index(df)
Expand All @@ -264,39 +272,52 @@ def vars_from_df(df, role=None, force_nominal=False):
exprs = [], [], []
vars_ = [], [], []

for column in df.columns:
def _convert_string(s, _):
return np.asarray(
# to object so that fillna can replace with nans if Unknown in nan
# replace nan with object Unknown assure that all values are string
s.astype(object).fillna(StringVariable.Unknown).astype(str),
dtype=object
)

conversions = {
DiscreteVariable: to_categorical,
ContinuousVariable: to_numeric,
TimeVariable: _convert_datetime,
StringVariable: _convert_string
}

for column, var in zip_longest(df.columns, variables or [], fillvalue=None):
s = df[column]
_role = Role.Attribute if role is None else role
if hasattr(df, 'orange_variables') and column in df.orange_variables:
if var is not None:
if not var.is_primitive():
_role = Role.Meta
expr = conversions[type(var)]
elif hasattr(df, 'orange_variables') and column in df.orange_variables:
original_var = df.orange_variables[column]
var = original_var.copy(compute_value=None)
expr = None
elif _is_datetime(s):
var = TimeVariable(str(column))
expr = _convert_datetime
elif _is_discrete(s, force_nominal):
discrete = s.astype("category").cat
var = DiscreteVariable(
str(column), discrete.categories.astype(str).tolist()
)
expr = to_categorical
elif is_numeric_dtype(s):
var = ContinuousVariable(
# set number of decimals to 0 if int else keeps default behaviour
str(column), number_of_decimals=(0 if is_integer_dtype(s) else None)
)
expr = None
else:
if role is not None and role != Role.Meta:
raise ValueError("String variable must be in metas.")
_role = Role.Meta
var = StringVariable(str(column))
expr = lambda s, _: np.asarray(
# to object so that fillna can replace with nans if Unknown in nan
# replace nan with object Unknown assure that all values are string
s.astype(object).fillna(StringVariable.Unknown).astype(str),
dtype=object
)
if _is_datetime(s):
var = TimeVariable(str(column))
elif _is_discrete(s, force_nominal):
discrete = s.astype("category").cat
var = DiscreteVariable(
str(column), discrete.categories.astype(str).tolist()
)
elif is_numeric_dtype(s):
var = ContinuousVariable(
# set number of decimals to 0 if int else keeps default behaviour
str(column), number_of_decimals=(0 if is_integer_dtype(s) else None)
)
else:
if role is not None and role != Role.Meta:
raise ValueError("String variable must be in metas.")
_role = Role.Meta
var = StringVariable(str(column))
expr = conversions[type(var)]


cols[_role].append(column)
exprs[_role].append(expr)
Expand Down Expand Up @@ -330,8 +351,10 @@ def vars_from_df(df, role=None, force_nominal=False):
return xym, Domain(*vars_)


def table_from_frame(df, *, force_nominal=False):
XYM, domain = vars_from_df(df, force_nominal=force_nominal)
def table_from_frame(df, *, force_nominal=False, variables=None):
XYM, domain = vars_from_df(df,
force_nominal=force_nominal,
variables=variables)

if hasattr(df, 'orange_weights') and hasattr(df, 'orange_attributes'):
W = [df.orange_weights[i] for i in df.index if i in df.orange_weights]
Expand Down
59 changes: 58 additions & 1 deletion Orange/data/tests/test_aggregate.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import unittest
from unittest.mock import Mock

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -132,13 +133,69 @@ def test_aggregation(self):
def test_preserve_table_class(self):
"""
Test whether the result table has the same type as the input table,
e.g. if input table corpus the resutlitn table must be corpus too.
e.g. if input table corpus the resulting table must be corpus too.
"""
data = AlternativeTable.from_table(self.data.domain, self.data)
gb = data.groupby([data.domain["a"]])
output = gb.aggregate({data.domain["a"]: ["mean"]})
self.assertIsInstance(output, AlternativeTable)

def test_preserve_variables(self):
    """
    Test that the optional third element of an aggregation tuple
    controls which variable describes the output column: True copies
    the input variable, False makes a new variable of the input's type,
    None leaves the type to be inferred from the aggregated values, and
    a Variable subclass requests a new variable of that class.
    """
    # the fixture domain is unpacked as four attributes; only the first
    # and the last one are used here
    a, _, _, dvar = self.data.domain.attributes
    gb = self.data.groupby([a])

    # marker attributes, used below to tell copied variables from new ones
    a.attributes = {"foo": "bar"}
    dvar.attributes = {"foo": "baz"}

    # wrap copy/make so that calls are recorded while the real methods
    # still run (side_effect delegates to the original bound method)
    a.copy = Mock(side_effect=a.copy)
    a.make = Mock(side_effect=a.make)

    # aggregation function that ignores its input and always yields 0
    def f(*_):
        return 0

    output = gb.aggregate(
        {a: [("copy", f, True),
             ("make", f, False),
             ("auto", f, None),
             ("string", f, StringVariable),
             ("number", f, ContinuousVariable)],
         dvar: [("copy", f, True),
                ("make", f, False),
                ("auto", f, None),
                ("string", f, StringVariable),
                ("discrete", f, DiscreteVariable)]}
    )
    # True: the input variable is copied, so its attributes come along
    self.assertIsInstance(output.domain["a - copy"], ContinuousVariable)
    a.copy.assert_called_once()
    self.assertEqual(output.domain["a - copy"].attributes, {"foo": "bar"})

    # False: same variable type, but the marker attributes are not expected
    self.assertIsInstance(output.domain["a - make"], ContinuousVariable)
    a.make.assert_called_once()
    self.assertNotEqual(output.domain["a - make"].attributes, {"foo": "bar"})

    # None: type is inferred from the aggregated values
    self.assertIsInstance(output.domain["a - auto"], ContinuousVariable)
    self.assertNotEqual(output.domain["a - auto"].attributes, {"foo": "bar"})

    # explicit Variable subclasses
    self.assertIsInstance(output.domain["a - string"], StringVariable)

    self.assertIsInstance(output.domain["a - number"], ContinuousVariable)
    self.assertNotEqual(output.domain["a - number"].attributes, {"foo": "bar"})

    # the same checks for the last attribute of the fixture domain
    self.assertIsInstance(output.domain["dvar - copy"], DiscreteVariable)
    self.assertEqual(output.domain["dvar - copy"].attributes, {"foo": "baz"})

    self.assertIsInstance(output.domain["dvar - make"], DiscreteVariable)
    self.assertNotEqual(output.domain["dvar - make"].attributes, {"foo": "baz"})

    # f returns 0, so the column looks numeric! Let's test that it is
    # converted to numeric.
    self.assertIsInstance(output.domain["dvar - auto"], ContinuousVariable)

    self.assertIsInstance(output.domain["dvar - string"], StringVariable)

    self.assertIsInstance(output.domain["dvar - discrete"], DiscreteVariable)
    self.assertNotEqual(output.domain["dvar - discrete"].attributes, {"foo": "baz"})


if __name__ == "__main__":
unittest.main()
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,24 @@ def test_table_from_frame(self):
self.assertEqual(names, ['0', '1', '2'])
self.assertEqual(types, [DiscreteVariable, ContinuousVariable, TimeVariable])

# Specify (some) variables
dvar = DiscreteVariable('x', values=tuple("dacb"))
cvar = ContinuousVariable('y')
table = table_from_frame(df, variables=[dvar, cvar, None])
self.assertIs(table.domain[0], dvar)
self.assertIs(table.domain[1], cvar)
self.assertIsInstance(table.domain[2], TimeVariable)

table = table_from_frame(df,
variables=[None, None, None],
force_nominal=True)
self.assertIsInstance(table.domain[0], DiscreteVariable)
self.assertIsInstance(table.domain[1], ContinuousVariable)
self.assertIsInstance(table.domain[2], TimeVariable)

self.assertRaises(AssertionError,
table_from_frame, df, variables=[None, None])

# Include index
df.index = list('abaa')
table = table_from_frame(df)
Expand Down
Loading

0 comments on commit 9497b39

Please sign in to comment.