diff --git a/CHANGELOG.md b/CHANGELOG.md index 889c2cc48..8f4c663ef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## [Unreleased] - [ENH] Added support for pd.Series.select - Issue #1394 @samukweku +- [ENH] Added support for janitor.mutate - Issue #1226 @samukweku ## [v0.30.0] - 2024-12-04 diff --git a/janitor/functions/__init__.py b/janitor/functions/__init__.py index 6e0f14fc6..dfd755de2 100644 --- a/janitor/functions/__init__.py +++ b/janitor/functions/__init__.py @@ -57,6 +57,7 @@ from .limit_column_characters import limit_column_characters from .min_max_scale import min_max_scale from .move import move +from .mutate import mutate from .pivot import ( pivot_longer, pivot_longer_spec, @@ -140,6 +141,7 @@ "limit_column_characters", "min_max_scale", "move", + "mutate", "pivot_longer", "pivot_longer_spec", "pivot_wider", diff --git a/janitor/functions/mutate.py b/janitor/functions/mutate.py new file mode 100644 index 000000000..ac1612e1f --- /dev/null +++ b/janitor/functions/mutate.py @@ -0,0 +1,269 @@ +"""Implementation of mutate.""" + +from __future__ import annotations + +from functools import singledispatch +from typing import Any + +import pandas as pd +import pandas_flavor as pf +from pandas.api.types import is_scalar +from pandas.core.common import apply_if_callable +from pandas.core.groupby.generic import DataFrameGroupBy + +from janitor.functions.select import get_index_labels +from janitor.utils import check + + +@pf.register_dataframe_method +def mutate( + df: pd.DataFrame, + *args: tuple[dict | tuple], + by: Any = None, + copy: bool = True, +) -> pd.DataFrame: + """ + + !!! info "New in version 0.31.0" + + !!!note + + Before reaching for `mutate`, try `pd.DataFrame.assign`. + + mutate creates new columns that are functions of existing columns. + It can also modify columns (if the name is the same as an existing column). + + The argument provided to *args* should be either a dictionary, a tuple or a callable. 
+ + - **dictionary argument**: + If the argument is a dictionary, + the value in the `{key:value}` pairing + should be either a string, a callable or a tuple. + + - If the value in the dictionary + is a string or a callable, + the key of the dictionary + should be an existing column name. + + !!!note + + - If the value is a string, + the string should be a pandas string function, + e.g. "sum", "mean", etc. + + - If the value of the dictionary is a tuple, + it should be of length 2, and of the form + `(column_name, mutation_func)`, + where `column_name` should exist in the DataFrame, + and `mutation_func` should be either a string or a callable. + The key in the dictionary can be a new column name. + + !!!note + + - If `mutation_func` is a string, + the string should be a pandas string function, + e.g. "sum", "mean", etc. + + + + - **tuple argument**: + If the argument is a tuple, it should be of length 2, + and of the form + `(column_name, mutation_func)`, + where `column_name` should exist in the DataFrame, + and `mutation_func` should be either a string or a callable. + + !!!note + + - If `mutation_func` is a string, + the string should be a pandas string function, + e.g. "sum", "mean", etc. + + !!!note + + - `column_name` can be anything supported by the + [`select`][janitor.functions.select.select] syntax; + as such multiple columns can be processed here - + they will be processed individually. + + + + - **callable argument**: + If the argument is a callable, the callable is applied + on the DataFrame or GroupBy object. + The result from the callable should be a named pandas Series + or a DataFrame. + + `by` can be a `DataFrameGroupBy` object; it is assumed that + `by` was created from `df` - the onus is on the user to + ensure that, or the aggregations may yield incorrect results. + + `by` accepts anything supported by `pd.DataFrame.groupby`. + + Arguments supported in `pd.DataFrame.groupby` + can also be passed to `by` via a dictionary. 
+ + Mutation does not occur on the original DataFrame; + change this behaviour by passing `copy=False`. + + Examples: + >>> import pandas as pd + >>> import numpy as np + >>> import janitor + >>> df = pd.DataFrame({ + ... "col1": [5, 10, 15], + ... "col2": [3, 6, 9], + ... "col3": [10, 100, 1_000], + ... }) + + Transformation via a dictionary: + >>> df.mutate( + ... {"col4": ('col1',np.log10), + ... "col1": np.log10} + ... ) + col1 col2 col3 col4 + 0 0.698970 3 10 0.698970 + 1 1.000000 6 100 1.000000 + 2 1.176091 9 1000 1.176091 + + Transformation via a tuple: + >>> df.mutate(("col1", np.log10)) + col1 col2 col3 + 0 0.698970 3 10 + 1 1.000000 6 100 + 2 1.176091 9 1000 + >>> df.mutate(("col*", np.log10)) + col1 col2 col3 + 0 0.698970 0.477121 1.0 + 1 1.000000 0.778151 2.0 + 2 1.176091 0.954243 3.0 + + Transformation via a callable: + >>> df.mutate(lambda df: df.sum(axis=1).rename('total')) + col1 col2 col3 total + 0 5 3 10 18 + 1 10 6 100 116 + 2 15 9 1000 1024 + + Transformation in the presence of a groupby: + >>> data = {'avg_jump': [3, 4, 1, 2, 3, 4], + ... 'avg_run': [3, 4, 1, 3, 2, 4], + ... 'combine_id': [100200, 100200, + ... 101200, 101200, + ... 102201, 103202]} + >>> df = pd.DataFrame(data) + >>> df.mutate({"avg_run_2":("avg_run","mean")}, by='combine_id') + avg_jump avg_run combine_id avg_run_2 + 0 3 3 100200 3.5 + 1 4 4 100200 3.5 + 2 1 1 101200 2.0 + 3 2 3 101200 2.0 + 4 3 2 102201 2.0 + 5 4 4 103202 4.0 + + Args: + df: A pandas DataFrame. + args: Either a dictionary, a tuple or a callable. + by: Column(s) to group by, a `DataFrameGroupBy` object, or a dictionary of `pd.DataFrame.groupby` arguments. + + Raises: + ValueError: If a tuple is passed and the length is not 2. + + Returns: + A pandas DataFrame with the new or modified columns. 
+ """ # noqa: E501 + check("copy", copy, [bool]) + if by is not None: + if isinstance(by, DataFrameGroupBy): + # it is assumed that by is created from df + # onus is on user to ensure that + pass + elif isinstance(by, dict): + by = df.groupby(**by) + else: + if is_scalar(by): + by = [by] + by = df.groupby(by, sort=False, observed=True) + if copy: + df = df.copy(deep=None) + for arg in args: + df = _mutator(arg, df=df, by=by) + return df + + +@singledispatch +def _mutator(arg, df, by): + if not callable(arg): + raise NotImplementedError( + f"janitor.mutate is not supported for {type(arg)}" + ) + if by is None: + val = df + else: + val = by + outcome = _process_maybe_callable(func=arg, obj=val) + if isinstance(outcome, pd.Series): + if outcome.name is None: # falsy labels such as 0 are valid names + raise ValueError("Ensure the pandas Series object has a name") + df[outcome.name] = outcome + return df + if isinstance(outcome, pd.DataFrame): + for column in outcome: + df[column] = outcome[column] + return df + raise TypeError( + "The output from a callable should be a named Series or a DataFrame" + ) + + +@_mutator.register(dict) +def _(arg, df, by): + """Dispatch function for dictionary""" + if by is None: + val = df + else: + val = by + for column_name, mutator in arg.items(): + if isinstance(mutator, tuple): + column, func = mutator + column = _process_within_dict(mutator=func, obj=val[column]) + else: + column = _process_within_dict( + mutator=mutator, obj=val[column_name] + ) + df[column_name] = column + return df + + +@_mutator.register(tuple) +def _(arg, df, by): + """Dispatch function for tuple""" + if len(arg) != 2: + raise ValueError("the tuple has to be a length of 2") + column_names, mutator = arg + column_names = get_index_labels(arg=[column_names], df=df, axis="columns") + mapping = {column_name: mutator for column_name in column_names} + return _mutator(mapping, df=df, by=by) + + +def _process_maybe_callable(func: callable, obj): + """Function to handle callables""" + try: + column = 
obj.transform(func) + except Exception: # fall back to calling func directly + column = apply_if_callable(maybe_callable=func, obj=obj) + return column + + +def _process_maybe_string(func: str, obj): + """Function to handle pandas string functions""" + # treat as a pandas approved string function + # https://pandas.pydata.org/docs/user_guide/groupby.html#built-in-aggregation-methods + return obj.transform(func) + + +def _process_within_dict(mutator, obj): + """Handle str/callables within a dictionary""" + if isinstance(mutator, str): + return _process_maybe_string(func=mutator, obj=obj) + return _process_maybe_callable(func=mutator, obj=obj) diff --git a/mkdocs/api/functions.md b/mkdocs/api/functions.md index 445f17328..fba17cd03 100644 --- a/mkdocs/api/functions.md +++ b/mkdocs/api/functions.md @@ -43,6 +43,7 @@ - limit_column_characters - min_max_scale - move + - mutate - pivot - process_text - remove_columns diff --git a/tests/functions/test_mutate.py b/tests/functions/test_mutate.py new file mode 100644 index 000000000..f3a4ec2ff --- /dev/null +++ b/tests/functions/test_mutate.py @@ -0,0 +1,184 @@ +import numpy as np +import pandas as pd +import pytest +from pandas.testing import assert_frame_equal + + +@pytest.fixture +def df_mutate(): + data = { + "avg_jump": [3, 4, 1, 2, 3, 4], + "avg_run": [3, 4, 1, 3, 2, 4], + "combine_id": [100200, 100200, 101200, 101200, 102201, 103202], + } + return pd.DataFrame(data) + + +def test_mutate_callable_dataframe(df_mutate): + """Test output for callable""" + expected = df_mutate.mutate(lambda df: df.add(1)) + actual = df_mutate.add(1) + assert_frame_equal(actual, expected) + + +def test_mutate_callable_series(df_mutate): + """Test output for callable""" + expected = df_mutate.mutate(lambda df: df.sum(axis=1).rename("new_column")) + actual = df_mutate.assign(new_column=lambda df: df.sum(axis=1)) + assert_frame_equal(actual, expected) + + +def test_mutate_callable_unnamed_series(df_mutate): + """Raise if Series is unnamed""" + with pytest.raises( + ValueError, 
match="Ensure the pandas Series object has a name" + ): + df_mutate.mutate(lambda df: df.sum(axis=1)) + + +def test_mutate_callable_by_grouped_object(df_mutate): + """Test output for callable""" + grp = df_mutate.groupby("combine_id") + actual = df_mutate.mutate(lambda df: df.avg_run.transform("sum"), by=grp) + expected = df_mutate.assign(avg_run=grp["avg_run"].transform("sum")) + assert_frame_equal(actual, expected) + + +def test_mutate_callable(df_mutate): + "Raise if output of callable is not a pandas Series/DataFrame" + with pytest.raises( + TypeError, + match="The output from a callable should be a named Series or a DataFrame", + ): + df_mutate.mutate(lambda df: np.sum(df["avg_run"])) + + +def test_mutate_check_copy(df_mutate): + """Test copy argument is a boolean""" + with pytest.raises(TypeError, match="copy should be one of.+"): + df_mutate.mutate({"a": "b"}, copy=1) + + +def test_mutate_wrong_arg(df_mutate): + """ + Raise if wrong arg is provided + """ + with pytest.raises( + NotImplementedError, match="janitor.mutate is not supported for.+" + ): + df_mutate.mutate(1) + + +def test_mutate_dict_df_str(df_mutate): + """Test output for a dictionary""" + actual = df_mutate.mutate({"avg_run": "sqrt"}) + expected = df_mutate.assign(avg_run=df_mutate["avg_run"].transform("sqrt")) + assert_frame_equal(actual, expected) + + +def test_mutate_dict_by_str(df_mutate): + """Test output for a dictionary""" + actual = df_mutate.mutate({"avg_run": "mean"}, by="combine_id") + expected = df_mutate.assign( + avg_run=df_mutate.groupby("combine_id")["avg_run"].transform("mean") + ) + assert_frame_equal(actual, expected) + + +def test_mutate_dict_df_callable(df_mutate): + """Test output for a dictionary""" + actual = df_mutate.mutate({"avg_run": lambda df: df.sum()}) + expected = df_mutate.assign(avg_run=df_mutate["avg_run"].sum()) + assert_frame_equal(actual, expected) + + +def test_mutate_dict_by_callable(df_mutate): + """Test output for a dictionary""" + actual = 
df_mutate.mutate( + {"avg_run": lambda df: df.sum()}, by="combine_id" + ) + expected = df_mutate.assign( + avg_run=df_mutate.groupby("combine_id")["avg_run"].transform("sum") + ) + assert_frame_equal(actual, expected) + + +def test_mutate_dict_by_transform_callable(df_mutate): + """Test output for a dictionary""" + actual = df_mutate.mutate( + {"avg_run": lambda df: df.transform("sum")}, by="combine_id" + ) + expected = df_mutate.assign( + avg_run=df_mutate.groupby("combine_id")["avg_run"].transform("sum") + ) + assert_frame_equal(actual, expected) + + +def test_mutate_dict_df_tuple(df_mutate): + """Test output for a dictionary""" + actual = df_mutate.mutate({"avg_run_sqrt": ("avg_run", "sqrt")}) + expected = df_mutate.assign( + avg_run_sqrt=df_mutate["avg_run"].transform("sqrt") + ) + assert_frame_equal(actual, expected) + + +def test_mutate_dict_by_tuple(df_mutate): + """Test output for a dictionary""" + actual = df_mutate.mutate( + {"avg_run_mean": ("avg_run", "mean")}, by={"by": "combine_id"} + ) + expected = df_mutate.assign( + avg_run_mean=df_mutate.groupby("combine_id")["avg_run"].transform( + "mean" + ) + ) + assert_frame_equal(actual, expected) + + +def test_mutate_tuple_count_not_eq_2(df_mutate): + """Raise error if length of tuple is not 2""" + with pytest.raises(ValueError, match="the tuple has to be a length of 2"): + df_mutate.mutate(("avg_run",)) + + +def test_mutate_df_tuple(df_mutate): + "Test output for a tuple" + actual = df_mutate.mutate(("avg_run", "sqrt")) + expected = df_mutate.assign(avg_run=df_mutate["avg_run"].transform("sqrt")) + assert_frame_equal(actual, expected) + + +def test_mutate_by_tuple(df_mutate): + """Test output for a tuple with a groupby""" + actual = df_mutate.mutate(("avg_run", "mean"), by="combine_id") + expected = df_mutate.assign( + avg_run=df_mutate.groupby("combine_id")["avg_run"].transform("mean") + ) + assert_frame_equal(actual, expected) + + +def test_mutate_tuple_df_callable(df_mutate): + """Test output for a tuple with a callable""" + 
actual = df_mutate.mutate(("avg_run", lambda df: df.sum())) + expected = df_mutate.assign(avg_run=df_mutate["avg_run"].sum()) + assert_frame_equal(actual, expected) + + +def test_mutate_tuple_by_callable(df_mutate): + """Test output for a tuple with a callable and a groupby""" + actual = df_mutate.mutate( + ("avg_run", lambda df: df.sum()), by="combine_id" + ) + expected = df_mutate.assign( + avg_run=df_mutate.groupby("combine_id")["avg_run"].transform("sum") + ) + assert_frame_equal(actual, expected) + + +def test_mutate_tuple_by_grouped_object(df_mutate): + """Test output for a tuple with a DataFrameGroupBy object""" + grp = df_mutate.groupby("combine_id") + actual = df_mutate.mutate(("avg_run", lambda df: df.sum()), by=grp) + expected = df_mutate.assign(avg_run=grp["avg_run"].transform("sum")) + assert_frame_equal(actual, expected)