Merge pull request #6906 from janezd/groupby-keep-variables

GroupBy: Avoid guessing variable types
biolab · Oct 12, 2024 · 9497b39 · 9497b39
2 parents c6a79f6 + c9edef4
commit 9497b39
Show file tree

Hide file tree

Showing 7 changed files with 254 additions and 83 deletions.
diff --git a/Orange/data/aggregate.py b/Orange/data/aggregate.py
@@ -1,5 +1,5 @@
 from functools import lru_cache
-from typing import Callable, Dict, List, Tuple, Union
+from typing import Callable, Dict, List, Tuple, Union, Type
 
 import pandas as pd
 
@@ -39,15 +39,20 @@ def __init__(self, table: Table, by: List[Variable]):
  df = table_to_frame(table, include_metas=True)
  # observed=True keeps only groups with at least one instance
  self.group_by = df.groupby([a.name for a in by], observed=True)
+ self.by = tuple(by)
 
  # lru_cache that caches on the object level
  self.compute_aggregation = lru_cache()(self._compute_aggregation)
 
+ AggDescType = Union[str,
+ Callable,
+ Tuple[str, Union[str, Callable]],
+ Tuple[str, Union[str, Callable], Union[Type[Variable], bool]]
+ ]
+
  def aggregate(
  self,
- aggregations: Dict[
- Variable, List[Union[str, Callable, Tuple[str, Union[str, Callable]]]]
- ],
+ aggregations: Dict[Variable, List[AggDescType]],
  callback: Callable = dummy_callback,
  ) -> Table:
  """
@@ -57,12 +62,16 @@ def aggregate(
  ----------
  aggregations
  The dictionary that defines aggregations that need to be computed
- for variables. We support two formats:
+ for variables. We support three formats:
  - {variable name: [agg function 1, agg function 2]}
  - {variable name: [(agg name 1, agg function 1), (agg name 2, agg function 2)]}
+ - {variable name: [(agg name 1, agg function 1, output_variable_type1), ...]}
  Where agg name is the aggregation name used in the output column name.
  Aggregation function can be either function or string that defines
  aggregation in Pandas (e.g. mean).
+ output_variable_type can be a type for a new variable, True to copy
+ the input variable, or False to create a new variable of the same type
+ as the input
  callback
  Callback function to report the progress
 
@@ -75,29 +84,44 @@ def aggregate(
  count = 0
 
  result_agg = []
+ output_variables = []
  for col, aggs in aggregations.items():
  for agg in aggs:
- res = self._compute_aggregation(col, agg)
+ res, var = self._compute_aggregation(col, agg)
  result_agg.append(res)
+ output_variables.append(var)
  count += 1
  callback(count / num_aggs * 0.8)
 
- agg_table = self._aggregations_to_table(result_agg)
+ agg_table = self._aggregations_to_table(result_agg, output_variables)
  callback(1)
  return agg_table
 
  def _compute_aggregation(
- self, col: Variable, agg: Union[str, Callable, Tuple[str, Union[str, Callable]]]
- ) -> pd.Series:
+ self, col: Variable, agg: AggDescType) -> Tuple[pd.Series, Variable]:
  # use named aggregation to avoid issues with same column names when reset_index
  if isinstance(agg, tuple):
- name, agg = agg
+ name, agg, var_type, *_ = (*agg, None)
  else:
  name = agg if isinstance(agg, str) else agg.__name__
+ var_type = None
  col_name = f"{col.name} - {name}"
- return self.group_by[col.name].agg(**{col_name: agg})
-
- def _aggregations_to_table(self, aggregations: List[pd.Series]) -> Table:
+ agg_col = self.group_by[col.name].agg(**{col_name: agg})
+ if var_type is True:
+ var = col.copy(name=col_name)
+ elif var_type is False:
+ var = col.make(name=col_name)
+ elif var_type is None:
+ var = None
+ else:
+ assert issubclass(var_type, Variable)
+ var = var_type.make(name=col_name)
+ return agg_col, var
+
+ def _aggregations_to_table(
+ self,
+ aggregations: List[pd.Series],
+ output_variables: List[Union[Variable, None]]) -> Table:
  """Concatenate aggregation series and convert back to Table"""
  if aggregations:
  df = pd.concat(aggregations, axis=1)
@@ -107,7 +131,7 @@ def _aggregations_to_table(self, aggregations: List[pd.Series]) -> Table:
  df = df.drop(columns=df.columns)
  gb_attributes = df.index.names
  df = df.reset_index() # move group by var that are in index to columns
- table = table_from_frame(df)
+ table = table_from_frame(df, variables=(*self.by, *output_variables))
 
  # group by variables should be last two columns in metas in the output
  metas = table.domain.metas

diff --git a/Orange/data/pandas_compat.py b/Orange/data/pandas_compat.py
@@ -1,5 +1,6 @@
 """Pandas DataFrame↔Table conversion helpers"""
 from functools import partial
+from itertools import zip_longest
 
 import numpy as np
 from scipy import sparse as sp
@@ -255,7 +256,14 @@ def to_categorical(s, _):
  return np.asarray(x)
 
 
-def vars_from_df(df, role=None, force_nominal=False):
+def to_numeric(s, _):
+ return np.asarray(pd.to_numeric(s))
+
+
+def vars_from_df(df, role=None, force_nominal=False, variables=None):
+ if variables is not None:
+ assert len(variables) == len(df.columns)
+
  if role is None and hasattr(df, 'orange_role'):
  role = df.orange_role
  df = _reset_index(df)
@@ -264,39 +272,52 @@ def vars_from_df(df, role=None, force_nominal=False):
  exprs = [], [], []
  vars_ = [], [], []
 
- for column in df.columns:
+ def _convert_string(s, _):
+ return np.asarray(
+ # to object so that fillna can replace with nans if Unknown in nan
+ # replacing nan with Unknown ensures that all values are strings
+ s.astype(object).fillna(StringVariable.Unknown).astype(str),
+ dtype=object
+ )
+
+ conversions = {
+ DiscreteVariable: to_categorical,
+ ContinuousVariable: to_numeric,
+ TimeVariable: _convert_datetime,
+ StringVariable: _convert_string
+ }
+
+ for column, var in zip_longest(df.columns, variables or [], fillvalue=None):
  s = df[column]
  _role = Role.Attribute if role is None else role
- if hasattr(df, 'orange_variables') and column in df.orange_variables:
+ if var is not None:
+ if not var.is_primitive():
+ _role = Role.Meta
+ expr = conversions[type(var)]
+ elif hasattr(df, 'orange_variables') and column in df.orange_variables:
  original_var = df.orange_variables[column]
  var = original_var.copy(compute_value=None)
  expr = None
- elif _is_datetime(s):
- var = TimeVariable(str(column))
- expr = _convert_datetime
- elif _is_discrete(s, force_nominal):
- discrete = s.astype("category").cat
- var = DiscreteVariable(
- str(column), discrete.categories.astype(str).tolist()
- )
- expr = to_categorical
- elif is_numeric_dtype(s):
- var = ContinuousVariable(
- # set number of decimals to 0 if int else keeps default behaviour
- str(column), number_of_decimals=(0 if is_integer_dtype(s) else None)
- )
- expr = None
  else:
- if role is not None and role != Role.Meta:
- raise ValueError("String variable must be in metas.")
- _role = Role.Meta
- var = StringVariable(str(column))
- expr = lambda s, _: np.asarray(
- # to object so that fillna can replace with nans if Unknown in nan
- # replace nan with object Unknown assure that all values are string
- s.astype(object).fillna(StringVariable.Unknown).astype(str),
- dtype=object
- )
+ if _is_datetime(s):
+ var = TimeVariable(str(column))
+ elif _is_discrete(s, force_nominal):
+ discrete = s.astype("category").cat
+ var = DiscreteVariable(
+ str(column), discrete.categories.astype(str).tolist()
+ )
+ elif is_numeric_dtype(s):
+ var = ContinuousVariable(
+ # set number of decimals to 0 if int else keeps default behaviour
+ str(column), number_of_decimals=(0 if is_integer_dtype(s) else None)
+ )
+ else:
+ if role is not None and role != Role.Meta:
+ raise ValueError("String variable must be in metas.")
+ _role = Role.Meta
+ var = StringVariable(str(column))
+ expr = conversions[type(var)]
+
 
  cols[_role].append(column)
  exprs[_role].append(expr)
@@ -330,8 +351,10 @@ def vars_from_df(df, role=None, force_nominal=False):
  return xym, Domain(*vars_)
 
 
-def table_from_frame(df, *, force_nominal=False):
- XYM, domain = vars_from_df(df, force_nominal=force_nominal)
+def table_from_frame(df, *, force_nominal=False, variables=None):
+ XYM, domain = vars_from_df(df,
+ force_nominal=force_nominal,
+ variables=variables)
 
  if hasattr(df, 'orange_weights') and hasattr(df, 'orange_attributes'):
  W = [df.orange_weights[i] for i in df.index if i in df.orange_weights]

diff --git a/Orange/data/tests/test_aggregate.py b/Orange/data/tests/test_aggregate.py
@@ -1,4 +1,5 @@
 import unittest
+from unittest.mock import Mock
 
 import numpy as np
 import pandas as pd
@@ -132,13 +133,69 @@ def test_aggregation(self):
  def test_preserve_table_class(self):
  """
  Test whether the result table has the same type as the input table,
- e.g. if input table corpus the resutlitn table must be corpus too.
+ e.g. if the input table is a corpus, the resulting table must be a corpus too.
  """
  data = AlternativeTable.from_table(self.data.domain, self.data)
  gb = data.groupby([data.domain["a"]])
  output = gb.aggregate({data.domain["a"]: ["mean"]})
  self.assertIsInstance(output, AlternativeTable)
 
+ def test_preserve_variables(self):
+ a, _, _, dvar = self.data.domain.attributes
+ gb = self.data.groupby([a])
+
+ a.attributes = {"foo": "bar"}
+ dvar.attributes = {"foo": "baz"}
+
+ a.copy = Mock(side_effect=a.copy)
+ a.make = Mock(side_effect=a.make)
+
+ def f(*_):
+ return 0
+
+ output = gb.aggregate(
+ {a: [("copy", f, True),
+ ("make", f, False),
+ ("auto", f, None),
+ ("string", f, StringVariable),
+ ("number", f, ContinuousVariable)],
+ dvar: [("copy", f, True),
+ ("make", f, False),
+ ("auto", f, None),
+ ("string", f, StringVariable),
+ ("discrete", f, DiscreteVariable)]}
+ )
+ self.assertIsInstance(output.domain["a - copy"], ContinuousVariable)
+ a.copy.assert_called_once()
+ self.assertEqual(output.domain["a - copy"].attributes, {"foo": "bar"})
+
+ self.assertIsInstance(output.domain["a - make"], ContinuousVariable)
+ a.make.assert_called_once()
+ self.assertNotEqual(output.domain["a - make"].attributes, {"foo": "bar"})
+
+ self.assertIsInstance(output.domain["a - auto"], ContinuousVariable)
+ self.assertNotEqual(output.domain["a - auto"].attributes, {"foo": "bar"})
+
+ self.assertIsInstance(output.domain["a - string"], StringVariable)
+
+ self.assertIsInstance(output.domain["a - number"], ContinuousVariable)
+ self.assertNotEqual(output.domain["a - number"].attributes, {"foo": "bar"})
+
+ self.assertIsInstance(output.domain["dvar - copy"], DiscreteVariable)
+ self.assertEqual(output.domain["dvar - copy"].attributes, {"foo": "baz"})
+
+ self.assertIsInstance(output.domain["dvar - make"], DiscreteVariable)
+ self.assertNotEqual(output.domain["dvar - make"].attributes, {"foo": "baz"})
+
+ # f returns 0, so the column looks numeric! Let's test that it is
+ # converted to numeric.
+ self.assertIsInstance(output.domain["dvar - auto"], ContinuousVariable)
+
+ self.assertIsInstance(output.domain["dvar - string"], StringVariable)
+
+ self.assertIsInstance(output.domain["dvar - discrete"], DiscreteVariable)
+ self.assertNotEqual(output.domain["dvar - discrete"].attributes, {"foo": "baz"})
+
 
 if __name__ == "__main__":
  unittest.main()
diff --git a/Orange/data/tests/test_pandas.py → Orange/data/tests/test_pandas_compat.py b/Orange/data/tests/test_pandas.py → Orange/data/tests/test_pandas_compat.py
@@ -56,6 +56,24 @@ def test_table_from_frame(self):
  self.assertEqual(names, ['0', '1', '2'])
  self.assertEqual(types, [DiscreteVariable, ContinuousVariable, TimeVariable])
 
+ # Specify (some) variables
+ dvar = DiscreteVariable('x', values=tuple("dacb"))
+ cvar = ContinuousVariable('y')
+ table = table_from_frame(df, variables=[dvar, cvar, None])
+ self.assertIs(table.domain[0], dvar)
+ self.assertIs(table.domain[1], cvar)
+ self.assertIsInstance(table.domain[2], TimeVariable)
+
+ table = table_from_frame(df,
+ variables=[None, None, None],
+ force_nominal=True)
+ self.assertIsInstance(table.domain[0], DiscreteVariable)
+ self.assertIsInstance(table.domain[1], ContinuousVariable)
+ self.assertIsInstance(table.domain[2], TimeVariable)
+
+ self.assertRaises(AssertionError,
+ table_from_frame, df, variables=[None, None])
+
  # Include index
  df.index = list('abaa')
  table = table_from_frame(df)