diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 007f5b7feb060..a34ec74f0b645 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -281,6 +281,8 @@ Other Enhancements all instances of ``set`` will not be considered "list-like" anymore (:issue:`23061`) - :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`). - New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`). +- :meth:`Series.update` now supports the same keywords and functionality as :meth:`DataFrame.update`. + In particular, it has gained the keywords ``overwrite``, ``filter_func`` and ``errors`` (:issue:`22358`, :issue:`23585`) - Compatibility with Matplotlib 3.0 (:issue:`22790`). - Added :meth:`Interval.overlaps`, :meth:`IntervalArray.overlaps`, and :meth:`IntervalIndex.overlaps` for determining overlaps between interval-like objects (:issue:`21998`) - :func:`~DataFrame.to_parquet` now supports writing a ``DataFrame`` as a directory of parquet files partitioned by a subset of the columns when ``engine = 'pyarrow'`` (:issue:`23283`) @@ -297,6 +299,7 @@ Backwards incompatible API changes - A newly constructed empty :class:`DataFrame` with integer as the ``dtype`` will now only be cast to ``float64`` if ``index`` is specified (:issue:`22858`) - :meth:`Series.str.cat` will now raise if `others` is a `set` (:issue:`23009`) - Passing scalar values to :class:`DatetimeIndex` or :class:`TimedeltaIndex` will now raise ``TypeError`` instead of ``ValueError`` (:issue:`23539`) +- :meth:`DataFrame.update` will now try to preserve the dtype of the caller as much as possible (:issue:`23606`) .. 
# NOTE: deliberately no docstring here -- ``Appender`` attaches the shared
# ``NDFrame.update`` documentation, and a local docstring would be prepended
# to it.
@Appender(NDFrame.update.__doc__)
@deprecate_kwarg(old_arg_name='raise_conflict', new_arg_name='errors',
                 mapping={False: 'ignore', True: 'raise'})
def update(self, other, join='left', overwrite=True, filter_func=None,
           errors='ignore'):
    # The deprecated ``raise_conflict=True|False`` keyword has already been
    # translated to ``errors='raise'|'ignore'`` by ``deprecate_kwarg``; the
    # actual update logic lives in the shared NDFrame implementation.
    forwarded = dict(join=join, overwrite=overwrite,
                     filter_func=filter_func, errors=errors)
    super(DataFrame, self).update(other, **forwarded)
def update(self, other, join='left', overwrite=True, filter_func=None,
           errors='ignore'):
    """
    Modify in place using non-NA values from another DataFrame.

    Series/DataFrame will be aligned on indexes, and whenever possible,
    the dtype of the individual Series of the caller will be preserved.

    There is no return value.

    Parameters
    ----------
    other : DataFrame, or object coercible into a DataFrame
        Should have at least one matching index/column label
        with the original DataFrame. If a Series is passed,
        its name attribute must be set, and that will be
        used as the column name to align with the original DataFrame.
    join : {'left'}, default 'left'
        Only left join is implemented, keeping the index and columns of the
        original object.
    overwrite : bool, default True
        How to handle non-NA values for overlapping keys:

        * True: overwrite original DataFrame's values
          with values from `other`.
        * False: only update values that are NA in
          the original DataFrame.

    filter_func : callable(1d-array) -> bool 1d-array, optional
        Can choose to replace values other than NA. Return True for values
        that should be updated. When given, `overwrite` and the
        overlap check of `errors='raise'` are not applied.
    errors : {'raise', 'ignore'}, default 'ignore'
        If 'raise', will raise a ValueError if the DataFrame and `other`
        both contain non-NA data in the same place.

        .. versionchanged :: 0.24.0
           Changed from `raise_conflict=False|True`
           to `errors='ignore'|'raise'`.

    Returns
    -------
    None : method directly changes calling object

    Raises
    ------
    ValueError
        * When `errors='raise'` and there's overlapping non-NA data.
        * When `errors` is not either `'ignore'` or `'raise'`
    NotImplementedError
        * If `join != 'left'`

    See Also
    --------
    Series.update : Similar method for `Series`.
    DataFrame.merge : For column(s)-on-columns(s) operations.
    dict.update : Similar method for `dict`.

    Examples
    --------
    >>> df = pd.DataFrame({'A': [1, 2, 3],
    ...                    'B': [400, 500, 600]})
    >>> new_df = pd.DataFrame({'B': [4, 5, 6],
    ...                        'C': [7, 8, 9]})
    >>> df.update(new_df)
    >>> df
       A  B
    0  1  4
    1  2  5
    2  3  6

    The DataFrame's length does not increase as a result of the update,
    only values at matching index/column labels are updated.

    >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
    ...                    'B': ['x', 'y', 'z']})
    >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']})
    >>> df.update(new_df)
    >>> df
       A  B
    0  a  d
    1  b  e
    2  c  f

    For Series, its name attribute must be set.

    >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
    ...                    'B': ['x', 'y', 'z']})
    >>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2])
    >>> df.update(new_column)
    >>> df
       A  B
    0  a  d
    1  b  y
    2  c  e
    >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
    ...                    'B': ['x', 'y', 'z']})
    >>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2])
    >>> df.update(new_df)
    >>> df
       A  B
    0  a  x
    1  b  d
    2  c  e

    If `other` contains NaNs the corresponding values are not updated
    in the original dataframe.

    >>> df = pd.DataFrame({'A': [1, 2, 3],
    ...                    'B': [400, 500, 600]})
    >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]})
    >>> df.update(new_df)
    >>> df
       A    B
    0  1    4
    1  2  500
    2  3    6
    """
    # imported locally, as in the original implementation
    from pandas import Series, DataFrame

    # TODO: Support other joins
    if join != 'left':  # pragma: no cover
        raise NotImplementedError("Only left join is supported")
    if errors not in ['ignore', 'raise']:
        raise ValueError("The parameter errors must be either "
                         "'ignore' or 'raise'")

    def _updated_values(this, that):
        # Shared by both branches; returns None if "this" remains unchanged.
        return missing._update_array(this, that, overwrite=overwrite,
                                     filter_func=filter_func,
                                     errors=errors)

    def _with_original_dtype(values, dtype):
        # Avoid unnecessary upcasting (introduced by alignment): try to
        # rebuild the values with the caller's dtype. Returns None when that
        # cast is rejected with a ValueError, so the caller can fall back to
        # the upcast values.
        try:
            return Series(values, index=self.index, dtype=dtype)
        except ValueError:
            return None

    if isinstance(self, ABCSeries):
        if not isinstance(other, ABCSeries):
            other = Series(other)
        other = other.reindex_like(self)
        this = self.values
        updated_array = _updated_values(this, other.values)

        # don't overwrite unnecessarily
        if updated_array is not None:
            updated = _with_original_dtype(updated_array, this.dtype)
            if updated is None:
                # keeping the dtype is not possible; accept the upcast
                updated = Series(updated_array, index=self.index)
            self._update_inplace(updated)
    else:  # DataFrame
        if not isinstance(other, ABCDataFrame):
            other = DataFrame(other)
        other = other.reindex_like(self)

        for col in self.columns:
            this = self[col].values
            updated_array = _updated_values(this, other[col].values)

            # don't overwrite unnecessarily
            if updated_array is None:
                continue

            # no problem to set DataFrame column with array
            updated = updated_array
            if updated_array.dtype != this.dtype:
                recast = _with_original_dtype(updated_array, this.dtype)
                if recast is not None:
                    updated = recast
            self[col] = updated
def update_array(this, that, overwrite=True, filter_func=None,
                 errors='ignore'):
    """
    Update one array with non-NA values from another array.

    Parameters
    ----------
    this : np.ndarray (one-dimensional)
        The array being updated.
    that : np.ndarray (one-dimensional)
        The array being used to update.
    overwrite : bool, default True
        How to handle non-NA values for overlapping keys:

        * True: overwrite original array's values with values from `that`.
        * False: only update values that are NA in `this`.

        Not used when `filter_func` is given.
    filter_func : callable(1d-array) -> boolean 1d-array, optional
        Can choose to replace values other than NA. Return True for values
        that should be updated.
    errors : {'raise', 'ignore'}, default 'ignore'
        If 'raise', will raise a ValueError if `this` and `that` both contain
        non-NA data in the same place. The overlap check is skipped when
        `filter_func` is given.

    Returns
    -------
    updated : np.ndarray (one-dimensional)
        The updated array. If no value had to be replaced, `this` itself
        is returned (not a copy).

    Raises
    ------
    ValueError
        * When `errors='raise'` and there's overlapping non-NA data.
        * When `errors` is not either `'ignore'` or `'raise'`

    See Also
    --------
    Series.update : Similar method for `Series`.
    DataFrame.update : Similar method for `DataFrame`.
    dict.update : Similar method for `dict`.
    """
    # Validate here so that a typo such as errors='rasie' is not silently
    # treated like 'ignore'.
    if errors not in ('ignore', 'raise'):
        raise ValueError("The parameter errors must be either "
                         "'ignore' or 'raise'")

    updated = _update_array(this, that, overwrite=overwrite,
                            filter_func=filter_func, errors=errors)
    return this if updated is None else updated


def _update_array(this, that, overwrite=True, filter_func=None,
                  errors='ignore'):
    """
    Same as update_array, except we return None if `this` is not updated.

    `errors` is not validated here; any value other than 'raise' skips the
    overlap check.
    """
    import pandas.core.computation.expressions as expressions

    # `keep` is True wherever the value of `this` must be retained.
    if filter_func is not None:
        # filter_func takes precedence over `overwrite` and `errors`
        with np.errstate(all='ignore'):
            keep = ~filter_func(this) | isna(that)
    else:
        if errors == 'raise':
            # vectorised reduction instead of the builtin any(), which
            # would iterate over the array element by element
            if (notna(this) & notna(that)).any():
                raise ValueError("Data overlaps.")

        keep = isna(that) if overwrite else notna(this)

    # don't overwrite unnecessarily
    if keep.all():
        return None

    return expressions.where(keep, this, that)
def update(self, other, join='left', overwrite=True, filter_func=None,
           errors='ignore'):
    """
    Modify Series in place using non-NA values from passed Series.

    Series will be aligned on indexes, and whenever possible, the dtype of
    the caller will be preserved.

    There is no return value.

    Parameters
    ----------
    other : Series, or object coercible into a Series
        Should have at least one matching index label with the calling
        Series.
    join : {'left'}, default 'left'
        Only left join is implemented, keeping the index of the
        original object.

        .. versionadded:: 0.24.0
    overwrite : bool, default True
        How to handle non-NA values for overlapping keys:

        * True: overwrite original Series' values
          with values from `other`.
        * False: only update values that are NA in
          the original Series.

        .. versionadded:: 0.24.0
    filter_func : callable(1d-array) -> bool 1d-array, optional
        Can choose to replace values other than NA. Return True for values
        that should be updated.

        .. versionadded:: 0.24.0
    errors : {'raise', 'ignore'}, default 'ignore'
        If 'raise', will raise a ValueError if the Series and `other`
        both contain non-NA data in the same place.

        .. versionadded:: 0.24.0

    Returns
    -------
    None : method directly changes calling object

    Raises
    ------
    ValueError
        * When `errors='raise'` and there's overlapping non-NA data.
        * When `errors` is not either `'ignore'` or `'raise'`
    NotImplementedError
        * If `join != 'left'`

    See Also
    --------
    DataFrame.update : Similar method for `DataFrame`.
    dict.update : Similar method for `dict`.

    Examples
    --------
    >>> s = pd.Series([1, 2, 3])
    >>> s.update(pd.Series([4, 5, 6]))
    >>> s
    0    4
    1    5
    2    6
    dtype: int64

    Only labels present in the caller are updated; labels that exist
    only in `other` are ignored.

    >>> s = pd.Series([10, 11, 12])
    >>> s.update(pd.Series([61, 63], index=[1, 3]))
    >>> s
    0    10
    1    61
    2    12
    dtype: int64

    If `other` contains NaNs the corresponding values are not updated
    in the original Series.

    >>> s = pd.Series([1, 2, 3])
    >>> s.update(pd.Series([4, np.nan, 6]))
    >>> s
    0    4
    1    2
    2    6
    dtype: int64
    """
    # All validation, alignment and dtype handling lives in the shared
    # NDFrame implementation.
    super(Series, self).update(other, join=join, overwrite=overwrite,
                               filter_func=filter_func,
                               errors=errors)
def test_update_dtypes(self):
    # caller holding a mix of floats and booleans
    mixed = Series([1., 2., False, True])
    mixed.update(Series([45]))
    assert_series_equal(mixed, Series([45., 2., False, True]))

    # `patch` only partially overlaps the caller's index (label 3 is absent)
    ints = Series([10, 11, 12])
    patch = Series([61, 63], index=[1, 3])
    ints.update(patch)

    unchanged_dtype = Series([10, 61, 12])
    assert_series_equal(ints, unchanged_dtype)

    # we always try to keep original dtype, even if other has different one
    ints.update(patch.astype(float))
    assert_series_equal(ints, unchanged_dtype)

    # if keeping the dtype is not possible, we allow upcasting
    ints.update(patch + 0.1)
    assert_series_equal(ints, Series([10., 61.1, 12.]))


def test_update_nooverwrite(self):
    target = Series([0, 1, 2, np.nan, np.nan, 5, 6, np.nan])
    source = Series([1, 3, np.nan, 7, 9], index=[1, 3, 5, 7, 9])

    # with overwrite=False only the NA slots of the caller may be filled
    target.update(source, overwrite=False)

    assert_series_equal(target,
                        Series([0, 1, 2, 3, np.nan, 5, 6, 7]))


def test_update_filtered(self):
    # for small values, np.arange defaults to int32,
    # but pandas default (e.g. for "expected" below) is int64
    base = np.arange(8)
    target = Series(base, dtype='int64')
    source = Series(base, dtype='int64') + 10

    def is_odd(values):
        return values % 2 == 1

    # only the entries selected by the filter are replaced
    target.update(source, filter_func=is_odd)

    expected = Series([0, 11, 2, 13, 4, 15, 6, 17])
    assert_series_equal(target, expected)


def test_update_raise(self):
    target = Series([0, 1, 2, np.nan, np.nan, 5, 6, np.nan])
    source = Series([1, 3, np.nan, 7, 9], index=[1, 3, 5, 7, 9])

    # label 1 is non-NA on both sides, which must be reported
    with pytest.raises(ValueError, match="Data overlaps"):
        target.update(source, errors='raise')