pyjanitor-devs · samukweku · Mar 2, 2025 · Mar 1, 2025 · Mar 1, 2025 · Mar 1, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,7 @@
 ## [Unreleased]
 
 -  [ENH] Added support for pd.Series.select - Issue #1394 @samukweku
+-  [ENH] Added suport for janitor.mutate - Issue #1226 @samukweku
 
 ## [v0.30.0] - 2024-12-04
 

diff --git a/janitor/functions/__init__.py b/janitor/functions/__init__.py
@@ -57,6 +57,7 @@
 from .limit_column_characters import limit_column_characters
 from .min_max_scale import min_max_scale
 from .move import move
+from .mutate import mutate
 from .pivot import (
     pivot_longer,
     pivot_longer_spec,
@@ -140,6 +141,7 @@
     "limit_column_characters",
     "min_max_scale",
     "move",
+    "mutate",
     "pivot_longer",
     "pivot_longer_spec",
     "pivot_wider",

diff --git a/janitor/functions/mutate.py b/janitor/functions/mutate.py
@@ -0,0 +1,269 @@
+"""Implementation of mutate."""
+
+from __future__ import annotations
+
+from functools import singledispatch
+from typing import Any
+
+import pandas as pd
+import pandas_flavor as pf
+from pandas.api.types import is_scalar
+from pandas.core.common import apply_if_callable
+from pandas.core.groupby.generic import DataFrameGroupBy
+
+from janitor.functions.select import get_index_labels
+from janitor.utils import check
+
+
+@pf.register_dataframe_method
+def mutate(
+    df: pd.DataFrame,
+    *args: tuple[dict | tuple],
+    by: Any = None,
+    copy: bool = True,
+) -> pd.DataFrame:
+    """
+
+    !!! info "New in version 0.31.0"
+
+    !!!note
+
+        Before reaching for `mutate`, try `pd.DataFrame.assign`.
+
+    mutate creates new columns that are functions of existing columns.
+    It can also modify columns (if the name is the same as an existing column).
+
+    The argument provided to *args* should be either a dictionary, a tuple or a callable.
+
+    - **dictionary argument**:
+    If the argument is a dictionary,
+    the value in the `{key:value}` pairing
+    should be either a string, a callable or a tuple.
+
+        - If the value in the dictionary
+        is a string or a callable,
+        the key of the dictionary
+        should be an existing column name.
+
+        !!!note
+
+            - If the value is a string,
+            the string should be a pandas string function,
+            e.g "sum", "mean", etc.
+
+        - If the value of the dictionary is a tuple,
+        it should be of length 2, and of the form
+        `(column_name, mutation_func)`,
+        where `column_name` should exist in the DataFrame,
+        and `mutation_func` should be either a string or a callable.
+        The key in the dictionary can be a new column name.
+
+        !!!note
+
+            - If `mutation_func` is a string,
+            the string should be a pandas string function,
+            e.g "sum", "mean", etc.
+
+
+
+    - **tuple argument**:
+    If the argument is a tuple, it should be of length 2,
+    and of the form
+    `(column_name, mutation_func)`,
+    where `column_name` should exist in the DataFrame,
+    and `mutation_func` should be either a string or a callable.
+
+        !!!note
+
+            - if `mutation_func` is a string,
+            the string should be a pandas string function,
+            e.g "sum", "mean", etc.
+
+        !!!note
+
+            - `column_name` can be anything supported by the
+            [`select`][janitor.functions.select.select] syntax;
+            as such multiple columns can be processed here -
+            they will be processed individually.
+
+
+
+    - **callable argument**:
+    If the argument is a callable, the callable is applied
+    on the DataFrame or GroupBy object.
+    The result from the callable should be a pandas Series
+    or DataFrame.
+
+    `by` can be a `DataFrameGroupBy` object; it is assumed that
+    `by` was created from `df` - the onus is on the user to
+    ensure that, or the aggregations may yield incorrect results.
+
+    `by` accepts anything supported by `pd.DataFrame.groupby`.
+
+    Arguments supported in `pd.DataFrame.groupby`
+    can also be passed to `by` via a dictionary.
+
+    Mutation does not occur on the original DataFrame;
+    change this behaviour by passing `copy=False`.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import numpy as np
+        >>> import janitor
+        >>> df = pd.DataFrame({
+        ...     "col1": [5, 10, 15],
+        ...     "col2": [3, 6, 9],
+        ...     "col3": [10, 100, 1_000],
+        ... })
+
+        Transformation via a dictionary:
+        >>> df.mutate(
+        ...     {"col4": ('col1',np.log10),
+        ...      "col1": np.log10}
+        ...     )
+               col1  col2  col3      col4
+        0  0.698970     3    10  0.698970
+        1  1.000000     6   100  1.000000
+        2  1.176091     9  1000  1.176091
+
+        Transformation via a tuple:
+        >>> df.mutate(("col1", np.log10))
+               col1  col2  col3
+        0  0.698970     3    10
+        1  1.000000     6   100
+        2  1.176091     9  1000
+        >>> df.mutate(("col*", np.log10))
+               col1      col2  col3
+        0  0.698970  0.477121   1.0
+        1  1.000000  0.778151   2.0
+        2  1.176091  0.954243   3.0
+
+        Transformation via a callable:
+        >>> df.mutate(lambda df: df.sum(axis=1).rename('total'))
+           col1  col2  col3  total
+        0     5     3    10     18
+        1    10     6   100    116
+        2    15     9  1000   1024
+
+        Transformation in the presence of a groupby:
+        >>> data = {'avg_jump': [3, 4, 1, 2, 3, 4],
+        ...         'avg_run': [3, 4, 1, 3, 2, 4],
+        ...         'combine_id': [100200, 100200,
+        ...                        101200, 101200,
+        ...                        102201, 103202]}
+        >>> df = pd.DataFrame(data)
+        >>> df.mutate({"avg_run_2":("avg_run","mean")}, by='combine_id')
+           avg_jump  avg_run  combine_id  avg_run_2
+        0         3        3      100200        3.5
+        1         4        4      100200        3.5
+        2         1        1      101200        2.0
+        3         2        3      101200        2.0
+        4         3        2      102201        2.0
+        5         4        4      103202        4.0
+
+    Args:
+        df: A pandas DataFrame.
+        args: Either a dictionary or a tuple.
+        by: Column(s) to group by.
+
+    Raises:
+        ValueError: If a tuple is passed and the length is not 2.
+
+    Returns:
+        A pandas DataFrame or Series with aggregated columns.
+    """  # noqa: E501
+    check("copy", copy, [bool])
+    if by is not None:
+        if isinstance(by, DataFrameGroupBy):
+            # it is assumed that by is created from df
+            # onus is on user to ensure that
+            pass
+        elif isinstance(by, dict):
+            by = df.groupby(**by)
+        else:
+            if is_scalar(by):
+                by = [by]
+            by = df.groupby(by, sort=False, observed=True)
+    if copy:
+        df = df.copy(deep=None)
+    for arg in args:
+        df = _mutator(arg, df=df, by=by)
+    return df
+
+
+@singledispatch
+def _mutator(arg, df, by):
+    if not callable(arg):
+        raise NotImplementedError(
+            f"janitor.mutate is not supported for {type(arg)}"
+        )
+    if by is None:
+        val = df
+    else:
+        val = by
+    outcome = _process_maybe_callable(func=arg, obj=val)
+    if isinstance(outcome, pd.Series):
+        if not outcome.name:
+            raise ValueError("Ensure the pandas Series object has a name")
+        df[outcome.name] = outcome
+        return df
+    if isinstance(outcome, pd.DataFrame):
+        for column in outcome:
+            df[column] = outcome[column]
+        return df
+    raise TypeError(
+        "The output from a callable should be a named Series or a DataFrame"
+    )
+
+
+@_mutator.register(dict)
+def _(arg, df, by):
+    """Dispatch function for dictionary"""
+    if by is None:
+        val = df
+    else:
+        val = by
+    for column_name, mutator in arg.items():
+        if isinstance(mutator, tuple):
+            column, func = mutator
+            column = _process_within_dict(mutator=func, obj=val[column])
+        else:
+            column = _process_within_dict(
+                mutator=mutator, obj=val[column_name]
+            )
+        df[column_name] = column
+    return df
+
+
+@_mutator.register(tuple)
+def _(arg, df, by):
+    """Dispatch function for tuple"""
+    if len(arg) != 2:
+        raise ValueError("the tuple has to be a length of 2")
+    column_names, mutator = arg
+    column_names = get_index_labels(arg=[column_names], df=df, axis="columns")
+    mapping = {column_name: mutator for column_name in column_names}
+    return _mutator(mapping, df=df, by=by)
+
+
+def _process_maybe_callable(func: callable, obj):
+    """Function to handle callables"""
+    try:
+        column = obj.transform(func)
+    except:  # noqa: E722
+        column = apply_if_callable(maybe_callable=func, obj=obj)
+    return column
+
+
+def _process_maybe_string(func: str, obj):
+    """Function to handle pandas string functions"""
+    # treat as a pandas approved string function
+    # https://pandas.pydata.org/docs/user_guide/groupby.html#built-in-aggregation-methods
+    return obj.transform(func)
+
+
+def _process_within_dict(mutator, obj):
+    """Handle str/callables within a dictionary"""
+    if isinstance(mutator, str):
+        return _process_maybe_string(func=mutator, obj=obj)
+    return _process_maybe_callable(func=mutator, obj=obj)
diff --git a/mkdocs/api/functions.md b/mkdocs/api/functions.md
@@ -43,6 +43,7 @@
         - limit_column_characters
         - min_max_scale
         - move
+        - mutate
         - pivot
         - process_text
         - remove_columns