From 0bb83257ce32d82f126ee121fffa2712064d17e7 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 15 Nov 2018 19:29:45 +0100 Subject: [PATCH 01/10] API: unify update to generic.py --- doc/source/whatsnew/v0.24.0.rst | 1 + pandas/core/frame.py | 152 ----------------------------- pandas/core/generic.py | 164 ++++++++++++++++++++++++++++++++ pandas/core/series.py | 37 +++++-- 4 files changed, 193 insertions(+), 161 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 007f5b7feb060..9f719c2d12003 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -281,6 +281,7 @@ Other Enhancements all instances of ``set`` will not be considered "list-like" anymore (:issue:`23061`) - :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`). - New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`). +- :meth:`Series.update` now supports the same keywords and functionality as :meth:`DataFrame.update` (:issue:`22358`) - Compatibility with Matplotlib 3.0 (:issue:`22790`). - Added :meth:`Interval.overlaps`, :meth:`IntervalArray.overlaps`, and :meth:`IntervalIndex.overlaps` for determining overlaps between interval-like objects (:issue:`21998`) - :func:`~DataFrame.to_parquet` now supports writing a ``DataFrame`` as a directory of parquet files partitioned by a subset of the columns when ``engine = 'pyarrow'`` (:issue:`23283`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e313e0f37a445..77c370d84f280 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5203,158 +5203,6 @@ def combiner(x, y): return self.combine(other, combiner, overwrite=False) - @deprecate_kwarg(old_arg_name='raise_conflict', new_arg_name='errors', - mapping={False: 'ignore', True: 'raise'}) - def update(self, other, join='left', overwrite=True, filter_func=None, - errors='ignore'): - """ - Modify in place using non-NA values from another DataFrame. 
- - Aligns on indices. There is no return value. - - Parameters - ---------- - other : DataFrame, or object coercible into a DataFrame - Should have at least one matching index/column label - with the original DataFrame. If a Series is passed, - its name attribute must be set, and that will be - used as the column name to align with the original DataFrame. - join : {'left'}, default 'left' - Only left join is implemented, keeping the index and columns of the - original object. - overwrite : bool, default True - How to handle non-NA values for overlapping keys: - - * True: overwrite original DataFrame's values - with values from `other`. - * False: only update values that are NA in - the original DataFrame. - - filter_func : callable(1d-array) -> bool 1d-array, optional - Can choose to replace values other than NA. Return True for values - that should be updated. - errors : {'raise', 'ignore'}, default 'ignore' - If 'raise', will raise a ValueError if the DataFrame and `other` - both contain non-NA data in the same place. - - .. versionchanged :: 0.24.0 - Changed from `raise_conflict=False|True` - to `errors='ignore'|'raise'`. - - Returns - ------- - None : method directly changes calling object - - Raises - ------ - ValueError - * When `errors='raise'` and there's overlapping non-NA data. - * When `errors` is not either `'ignore'` or `'raise'` - NotImplementedError - * If `join != 'left'` - - See Also - -------- - dict.update : Similar method for dictionaries. - DataFrame.merge : For column(s)-on-columns(s) operations. - - Examples - -------- - >>> df = pd.DataFrame({'A': [1, 2, 3], - ... 'B': [400, 500, 600]}) - >>> new_df = pd.DataFrame({'B': [4, 5, 6], - ... 'C': [7, 8, 9]}) - >>> df.update(new_df) - >>> df - A B - 0 1 4 - 1 2 5 - 2 3 6 - - The DataFrame's length does not increase as a result of the update, - only values at matching index/column labels are updated. - - >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], - ... 
'B': ['x', 'y', 'z']}) - >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']}) - >>> df.update(new_df) - >>> df - A B - 0 a d - 1 b e - 2 c f - - For Series, it's name attribute must be set. - - >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], - ... 'B': ['x', 'y', 'z']}) - >>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2]) - >>> df.update(new_column) - >>> df - A B - 0 a d - 1 b y - 2 c e - >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], - ... 'B': ['x', 'y', 'z']}) - >>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2]) - >>> df.update(new_df) - >>> df - A B - 0 a x - 1 b d - 2 c e - - If `other` contains NaNs the corresponding values are not updated - in the original dataframe. - - >>> df = pd.DataFrame({'A': [1, 2, 3], - ... 'B': [400, 500, 600]}) - >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]}) - >>> df.update(new_df) - >>> df - A B - 0 1 4.0 - 1 2 500.0 - 2 3 6.0 - """ - import pandas.core.computation.expressions as expressions - # TODO: Support other joins - if join != 'left': # pragma: no cover - raise NotImplementedError("Only left join is supported") - if errors not in ['ignore', 'raise']: - raise ValueError("The parameter errors must be either " - "'ignore' or 'raise'") - - if not isinstance(other, DataFrame): - other = DataFrame(other) - - other = other.reindex_like(self) - - for col in self.columns: - this = self[col].values - that = other[col].values - if filter_func is not None: - with np.errstate(all='ignore'): - mask = ~filter_func(this) | isna(that) - else: - if errors == 'raise': - mask_this = notna(that) - mask_that = notna(this) - if any(mask_this & mask_that): - raise ValueError("Data overlaps.") - - if overwrite: - mask = isna(that) - else: - mask = notna(this) - - # don't overwrite columns unecessarily - if mask.all(): - continue - - self[col] = expressions.where(mask, this, that) - # ---------------------------------------------------------------------- # Data reshaping diff --git a/pandas/core/generic.py 
b/pandas/core/generic.py index 9f56433c6868e..d256bb19f9b94 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -101,6 +101,32 @@ def _single_replace(self, to_replace, method, inplace, limit): return result +def _update_column(this, that, overwrite=True, filter_func=None, + raise_conflict=False): + import pandas.core.computation.expressions as expressions + + if filter_func is not None: + with np.errstate(all='ignore'): + mask = ~filter_func(this) | isna(that) + else: + if raise_conflict: + mask_this = notna(that) + mask_that = notna(this) + if any(mask_this & mask_that): + raise ValueError("Data overlaps.") + + if overwrite: + mask = isna(that) + else: + mask = notna(this) + + # don't overwrite columns unnecessarily + if mask.all(): + return None + + return expressions.where(mask, this, that) + + class NDFrame(PandasObject, SelectionMixin): """ N-dimensional analogue of DataFrame. Store multi-dimensional in a @@ -4173,6 +4199,144 @@ def _reindex_with_indexers(self, reindexers, fill_value=None, copy=False, return self._constructor(new_data).__finalize__(self) + def update(self, other, join='left', overwrite=True, filter_func=None, + raise_conflict=False): + """ + Modify in place using non-NA values from another DataFrame. + + Aligns on indices. There is no return value. + + Parameters + ---------- + other : DataFrame, or object coercible into a DataFrame + Should have at least one matching index/column label + with the original DataFrame. If a Series is passed, + its name attribute must be set, and that will be + used as the column name to align with the original DataFrame. + join : {'left'}, default 'left' + Only left join is implemented, keeping the index and columns of the + original object. + overwrite : bool, default True + How to handle non-NA values for overlapping keys: + + * True: overwrite original DataFrame's values + with values from `other`. + * False: only update values that are NA in + the original DataFrame. 
+ + filter_func : callable(1d-array) -> boolean 1d-array, optional + Can choose to replace values other than NA. Return True for values + that should be updated. + raise_conflict : bool, default False + If True, will raise a ValueError if the DataFrame and `other` + both contain non-NA data in the same place. + + Raises + ------ + ValueError + When `raise_conflict` is True and there's overlapping non-NA data. + + See Also + -------- + dict.update : Similar method for dictionaries. + DataFrame.merge : For column(s)-on-columns(s) operations. + + Examples + -------- + >>> df = pd.DataFrame({'A': [1, 2, 3], + ... 'B': [400, 500, 600]}) + >>> new_df = pd.DataFrame({'B': [4, 5, 6], + ... 'C': [7, 8, 9]}) + >>> df.update(new_df) + >>> df + A B + 0 1 4 + 1 2 5 + 2 3 6 + + The DataFrame's length does not increase as a result of the update, + only values at matching index/column labels are updated. + + >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], + ... 'B': ['x', 'y', 'z']}) + >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']}) + >>> df.update(new_df) + >>> df + A B + 0 a d + 1 b e + 2 c f + + For Series, it's name attribute must be set. + + >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], + ... 'B': ['x', 'y', 'z']}) + >>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2]) + >>> df.update(new_column) + >>> df + A B + 0 a d + 1 b y + 2 c e + >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], + ... 'B': ['x', 'y', 'z']}) + >>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2]) + >>> df.update(new_df) + >>> df + A B + 0 a x + 1 b d + 2 c e + + If `other` contains NaNs the corresponding values are not updated + in the original dataframe. + + >>> df = pd.DataFrame({'A': [1, 2, 3], + ... 
'B': [400, 500, 600]}) + >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]}) + >>> df.update(new_df) + >>> df + A B + 0 1 4.0 + 1 2 500.0 + 2 3 6.0 + """ + from pandas import Series, DataFrame + # TODO: Support other joins + if join != 'left': # pragma: no cover + raise NotImplementedError("Only left join is supported") + + if isinstance(self, ABCSeries): + if not isinstance(other, ABCSeries): + other = Series(other) + other = other.reindex_like(self) + this = self.values + that = other.values + updated = _update_column(this, that, overwrite=overwrite, + filter_func=filter_func, + raise_conflict=raise_conflict) + if updated is None: + # don't overwrite Series unnecessarily + return + self._data._block.values = updated + else: # DataFrame + if not isinstance(other, ABCDataFrame): + other = DataFrame(other) + + other = other.reindex_like(self) + + for col in self.columns: + this = self[col].values + that = other[col].values + + updated = _update_column(this, that, overwrite=overwrite, + filter_func=filter_func, + raise_conflict=raise_conflict) + # don't overwrite columns unnecessarily + if updated is None: + continue + self[col] = updated + def filter(self, items=None, like=None, regex=None, axis=None): """ Subset rows or columns of dataframe according to labels in diff --git a/pandas/core/series.py b/pandas/core/series.py index 8fba3030be9d4..47190636ce7fe 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2385,14 +2385,35 @@ def combine_first(self, other): return this.where(notna(this), other) - def update(self, other): + def update(self, other, join='left', overwrite=True, filter_func=None, + raise_conflict=False): """ - Modify Series in place using non-NA values from passed - Series. Aligns on index + Modify Series in place using non-NA values from passed Series. + + Aligns on index. Parameters ---------- - other : Series + other : Series, or object coercible into a Series + Should have at least one matching index label with the calling + Series. 
+ join : {'left'}, default 'left' + Only left join is implemented, keeping the index and columns of the + original object. + overwrite : bool, default True + How to handle non-NA values for overlapping keys: + + * True: overwrite original DataFrame's values + with values from `other`. + * False: only update values that are NA in + the original DataFrame. + + filter_func : callable(1d-array) -> boolean 1d-array, optional + Can choose to replace values other than NA. Return True for values + that should be updated. + raise_conflict : bool, default False + If True, will raise a ValueError if the DataFrame and `other` + both contain non-NA data in the same place. Examples -------- @@ -2431,11 +2452,9 @@ def update(self, other): 2 6 dtype: int64 """ - other = other.reindex_like(self) - mask = notna(other) - - self._data = self._data.putmask(mask=mask, new=other, inplace=True) - self._maybe_update_cacher() + super(Series, self).update(other, join=join, overwrite=overwrite, + filter_func=filter_func, + raise_conflict=raise_conflict) # ---------------------------------------------------------------------- # Reindexing, sorting From 56f569b0c081e01c9fdb7447efc6b01be8125449 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 17 Oct 2018 01:14:55 +0200 Subject: [PATCH 02/10] Fix docstrings --- pandas/core/generic.py | 7 ++++++- pandas/core/series.py | 13 +++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d256bb19f9b94..fc7c801de5bc1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4236,10 +4236,15 @@ def update(self, other, join='left', overwrite=True, filter_func=None, ValueError When `raise_conflict` is True and there's overlapping non-NA data. + Returns + ------- + Nothing, the object is modified inplace. + See Also -------- - dict.update : Similar method for dictionaries. + Series.update : Similar method for `Series`. 
DataFrame.merge : For column(s)-on-columns(s) operations. + dict.update : Similar method for `dict`. Examples -------- diff --git a/pandas/core/series.py b/pandas/core/series.py index 47190636ce7fe..82d2b5ecbcd30 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2415,6 +2415,11 @@ def update(self, other, join='left', overwrite=True, filter_func=None, If True, will raise a ValueError if the DataFrame and `other` both contain non-NA data in the same place. + See Also + -------- + DataFrame.update : Similar method for `DataFrame`. + dict.update : Similar method for `dict` + Examples -------- >>> s = pd.Series([1, 2, 3]) @@ -2447,10 +2452,10 @@ def update(self, other, join='left', overwrite=True, filter_func=None, >>> s = pd.Series([1, 2, 3]) >>> s.update(pd.Series([4, np.nan, 6])) >>> s - 0 4 - 1 2 - 2 6 - dtype: int64 + 0 4.0 + 1 2.0 + 2 6.0 + dtype: float64 """ super(Series, self).update(other, join=join, overwrite=overwrite, filter_func=filter_func, From e1abe77923e1b5d80dd811b10fdc047bfb5dd8bd Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 15 Nov 2018 19:30:53 +0100 Subject: [PATCH 03/10] Review (jreback) --- doc/source/whatsnew/v0.24.0.rst | 3 +- pandas/core/frame.py | 8 ++++ pandas/core/generic.py | 61 +++++++++--------------------- pandas/core/missing.py | 66 ++++++++++++++++++++++++++++++++- pandas/core/series.py | 15 ++++++-- 5 files changed, 104 insertions(+), 49 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 9f719c2d12003..de911b90f650a 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -281,7 +281,8 @@ Other Enhancements all instances of ``set`` will not be considered "list-like" anymore (:issue:`23061`) - :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`). - New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`). 
-- :meth:`Series.update` now supports the same keywords and functionality as :meth:`DataFrame.update` (:issue:`22358`) +- :meth:`Series.update` now supports the same keywords and functionality as :meth:`DataFrame.update`. + In particular, it has gained the keywords ``overwrite``, ``filter_func`` and ``errors`` (:issue:`22358`) - Compatibility with Matplotlib 3.0 (:issue:`22790`). - Added :meth:`Interval.overlaps`, :meth:`IntervalArray.overlaps`, and :meth:`IntervalIndex.overlaps` for determining overlaps between interval-like objects (:issue:`21998`) - :func:`~DataFrame.to_parquet` now supports writing a ``DataFrame`` as a directory of parquet files partitioned by a subset of the columns when ``engine = 'pyarrow'`` (:issue:`23283`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 77c370d84f280..c2a4e98db331d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5203,6 +5203,14 @@ def combiner(x, y): return self.combine(other, combiner, overwrite=False) + @Appender(NDFrame.update.__doc__) + @deprecate_kwarg(old_arg_name='raise_conflict', new_arg_name='errors', + mapping={False: 'ignore', True: 'raise'}) + def update(self, other, join='left', overwrite=True, filter_func=None, + errors='ignore'): + super(DataFrame, self).update(other, join=join, overwrite=overwrite, + filter_func=filter_func, errors=errors) + # ---------------------------------------------------------------------- # Data reshaping diff --git a/pandas/core/generic.py b/pandas/core/generic.py index fc7c801de5bc1..6c87d7e60b5f1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -101,32 +101,6 @@ def _single_replace(self, to_replace, method, inplace, limit): return result -def _update_column(this, that, overwrite=True, filter_func=None, - raise_conflict=False): - import pandas.core.computation.expressions as expressions - - if filter_func is not None: - with np.errstate(all='ignore'): - mask = ~filter_func(this) | isna(that) - else: - if raise_conflict: - 
mask_this = notna(that) - mask_that = notna(this) - if any(mask_this & mask_that): - raise ValueError("Data overlaps.") - - if overwrite: - mask = isna(that) - else: - mask = notna(this) - - # don't overwrite columns unnecessarily - if mask.all(): - return None - - return expressions.where(mask, this, that) - - class NDFrame(PandasObject, SelectionMixin): """ N-dimensional analogue of DataFrame. Store multi-dimensional in a @@ -4200,7 +4174,7 @@ def _reindex_with_indexers(self, reindexers, fill_value=None, copy=False, return self._constructor(new_data).__finalize__(self) def update(self, other, join='left', overwrite=True, filter_func=None, - raise_conflict=False): + errors='ignore'): """ Modify in place using non-NA values from another DataFrame. @@ -4227,8 +4201,8 @@ def update(self, other, join='left', overwrite=True, filter_func=None, filter_func : callable(1d-array) -> boolean 1d-array, optional Can choose to replace values other than NA. Return True for values that should be updated. - raise_conflict : bool, default False - If True, will raise a ValueError if the DataFrame and `other` + errors : {'raise', 'ignore'}, default 'ignore' + If 'raise', will raise a ValueError if the DataFrame and `other` both contain non-NA data in the same place. 
Raises @@ -4317,13 +4291,15 @@ def update(self, other, join='left', overwrite=True, filter_func=None, other = other.reindex_like(self) this = self.values that = other.values - updated = _update_column(this, that, overwrite=overwrite, - filter_func=filter_func, - raise_conflict=raise_conflict) - if updated is None: - # don't overwrite Series unnecessarily - return - self._data._block.values = updated + + # missing.update_array returns an np.ndarray + updated_values = missing.update_array(this, that, + overwrite=overwrite, + filter_func=filter_func, + errors=errors) + # don't overwrite unnecessarily + if updated_values is not None: + self._update_inplace(Series(updated_values, index=self.index)) else: # DataFrame if not isinstance(other, ABCDataFrame): other = DataFrame(other) @@ -4334,13 +4310,12 @@ def update(self, other, join='left', overwrite=True, filter_func=None, this = self[col].values that = other[col].values - updated = _update_column(this, that, overwrite=overwrite, - filter_func=filter_func, - raise_conflict=raise_conflict) - # don't overwrite columns unnecessarily - if updated is None: - continue - self[col] = updated + updated = missing.update_array(this, that, overwrite=overwrite, + filter_func=filter_func, + errors=errors) + # don't overwrite unnecessarily + if updated is not None: + self[col] = updated def filter(self, items=None, like=None, regex=None, axis=None): """ diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 222873cd7f81a..1b8b2fa3f4656 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -21,7 +21,7 @@ ensure_float64) from pandas.core.dtypes.cast import infer_dtype_from_array -from pandas.core.dtypes.missing import isna +from pandas.core.dtypes.missing import isna, notna def mask_missing(arr, values_to_mask): @@ -75,6 +75,70 @@ def mask_missing(arr, values_to_mask): return mask +def update_array(this, that, overwrite=True, filter_func=None, + errors='ignore'): + """ + Update one array with non-NA values from 
another array. + + Parameters + ---------- + this : np.ndarray (one-dimensional) + The array being updated. + that : np.ndarray (one-dimensional) + The array being used to update. + overwrite : bool, default True + How to handle non-NA values for overlapping keys: + + * True: overwrite original array's values with values from `that`. + * False: only update values that are NA in `this`. + + filter_func : callable(1d-array) -> boolean 1d-array, optional + Can choose to replace values other than NA. Return True for values + that should be updated. + errors : {'raise', 'ignore'}, default 'ignore' + If 'raise', will raise a ValueError if `this` and `that` both contain + non-NA data in the same place. + + Raises + ------ + ValueError + When `errors='raise'` and there's overlapping non-NA data. + + Returns + ------- + updated : np.ndarray (one-dimensional) or None + The updated array. Return None if `this` remains unchanged + + See Also + -------- + Series.update : Similar method for `Series`. + DataFrame.update : Similar method for `DataFrame`. + dict.update : Similar method for `dict`. 
+ """ + import pandas.core.computation.expressions as expressions + + if filter_func is not None: + with np.errstate(all='ignore'): + mask = ~filter_func(this) | isna(that) + else: + if errors == 'raise': + mask_this = notna(that) + mask_that = notna(this) + if any(mask_this & mask_that): + raise ValueError("Data overlaps.") + + if overwrite: + mask = isna(that) + else: + mask = notna(this) + + # don't overwrite columns unnecessarily + if mask.all(): + return None + + return expressions.where(mask, this, that) + + def clean_fill_method(method, allow_nearest=False): # asfreq is compat for resampling if method in [None, 'asfreq']: diff --git a/pandas/core/series.py b/pandas/core/series.py index 82d2b5ecbcd30..1863064703426 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2386,7 +2386,7 @@ def combine_first(self, other): return this.where(notna(this), other) def update(self, other, join='left', overwrite=True, filter_func=None, - raise_conflict=False): + errors='ignore'): """ Modify Series in place using non-NA values from passed Series. @@ -2400,6 +2400,8 @@ def update(self, other, join='left', overwrite=True, filter_func=None, join : {'left'}, default 'left' Only left join is implemented, keeping the index and columns of the original object. + + .. versionadded:: 0.24.0 overwrite : bool, default True How to handle non-NA values for overlapping keys: @@ -2408,13 +2410,18 @@ def update(self, other, join='left', overwrite=True, filter_func=None, * False: only update values that are NA in the original DataFrame. + .. versionadded:: 0.24.0 filter_func : callable(1d-array) -> boolean 1d-array, optional Can choose to replace values other than NA. Return True for values that should be updated. - raise_conflict : bool, default False - If True, will raise a ValueError if the DataFrame and `other` + + .. 
versionadded:: 0.24.0 + errors : {'raise', 'ignore'}, default 'ignore' + If 'raise', will raise a ValueError if the DataFrame and `other` both contain non-NA data in the same place. + .. versionadded:: 0.24.0 + See Also -------- DataFrame.update : Similar method for `DataFrame`. @@ -2459,7 +2466,7 @@ def update(self, other, join='left', overwrite=True, filter_func=None, """ super(Series, self).update(other, join=join, overwrite=overwrite, filter_func=filter_func, - raise_conflict=raise_conflict) + errors=errors) # ---------------------------------------------------------------------- # Reindexing, sorting From 802f6acd7e0005f1debc8aa3e5bff2834abd6d44 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Fri, 9 Nov 2018 00:59:26 +0100 Subject: [PATCH 04/10] Add further tests for Series case --- pandas/tests/series/test_combine_concat.py | 25 ++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py index 3f137bf686715..65854eb5a4d99 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -119,6 +119,31 @@ def test_update(self): # this will fail as long as series is a sub-class of ndarray # df['c'].update(Series(['foo'],index=[0])) ##### + def test_update_nooverwrite(self): + s = Series([0, 1, 2, np.nan, np.nan, 5, 6, np.nan]) + other = Series([1, 3, np.nan, 7, 9], index=[1, 3, 5, 7, 9]) + + s.update(other, overwrite=False) + + expected = Series([0, 1, 2, 3, np.nan, 5, 6, 7]) + assert_series_equal(s, expected) + + def test_update_filtered(self): + s = Series(np.arange(8), dtype='int64') + other = Series(np.arange(8), dtype='int64') + 10 + + s.update(other, filter_func=lambda x: x % 2 == 1) + + expected = Series([0, 11, 2, 13, 4, 15, 6, 17]) + assert_series_equal(s, expected) + + def test_update_raise(self): + s = Series([0, 1, 2, np.nan, np.nan, 5, 6, np.nan]) + other = Series([1, 3, np.nan, 7, 9], index=[1, 3, 5, 7, 9]) 
+ + with tm.assert_raises_regex(ValueError, "Data overlaps"): + s.update(other, errors='raise') + def test_concat_empty_series_dtypes_roundtrips(self): # round-tripping with self & like self From 07784f07fb51dc2c45167c8cff48db99fb29a9f8 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 15 Nov 2018 19:32:08 +0100 Subject: [PATCH 05/10] Update whatsnew --- doc/source/whatsnew/v0.24.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index de911b90f650a..6066f0ec44147 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -282,7 +282,7 @@ Other Enhancements - :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`). - New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`). - :meth:`Series.update` now supports the same keywords and functionality as :meth:`DataFrame.update`. - In particular, it has gained the keywords ``overwrite``, ``filter_func`` and ``errors`` (:issue:`22358`) + In particular, it has gained the keywords ``overwrite``, ``filter_func`` and ``errors`` (:issue:`22358`, :issue:`23585`) - Compatibility with Matplotlib 3.0 (:issue:`22790`). - Added :meth:`Interval.overlaps`, :meth:`IntervalArray.overlaps`, and :meth:`IntervalIndex.overlaps` for determining overlaps between interval-like objects (:issue:`21998`) - :func:`~DataFrame.to_parquet` now supports writing a ``DataFrame`` as a directory of parquet files partitioned by a subset of the columns when ``engine = 'pyarrow'`` (:issue:`23283`) @@ -1026,7 +1026,7 @@ Deprecations - The ``fastpath`` keyword of the different Index constructors is deprecated (:issue:`23110`). - :meth:`Timestamp.tz_localize`, :meth:`DatetimeIndex.tz_localize`, and :meth:`Series.tz_localize` have deprecated the ``errors`` argument in favor of the ``nonexistent`` argument (:issue:`8917`) - The class ``FrozenNDArray`` has been deprecated. 
When unpickling, ``FrozenNDArray`` will be unpickled to ``np.ndarray`` once this class is removed (:issue:`9031`) -- The methods :meth:`DataFrame.update` and :meth:`Panel.update` have deprecated the ``raise_conflict=False|True`` keyword in favor of ``errors='ignore'|'raise'`` (:issue:`23585`) +- The method :meth:`DataFrame.update` has deprecated the ``raise_conflict=False|True`` keyword in favor of ``errors='ignore'|'raise'`` (:issue:`23585`) - Deprecated the `nthreads` keyword of :func:`pandas.read_feather` in favor of `use_threads` to reflect the changes in pyarrow 0.11.0. (:issue:`23053`) - :func:`pandas.read_excel` has deprecated accepting ``usecols`` as an integer. Please pass in a list of ints from 0 to ``usecols`` inclusive instead (:issue:`23527`) From f120d65568c9c26f510e2548d8beafbc039acdbb Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Fri, 9 Nov 2018 08:45:15 +0100 Subject: [PATCH 06/10] Keep dtype whenever possible; add _update_array; docstring fixes --- pandas/core/generic.py | 51 +++++++++++++++------- pandas/core/missing.py | 14 +++++- pandas/core/series.py | 26 ++++++++--- pandas/tests/frame/test_combine_concat.py | 19 ++++++++ pandas/tests/series/test_combine_concat.py | 39 ++++++++++++++--- 5 files changed, 120 insertions(+), 29 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6c87d7e60b5f1..17fbe060dd43a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4178,7 +4178,10 @@ def update(self, other, join='left', overwrite=True, filter_func=None, """ Modify in place using non-NA values from another DataFrame. - Aligns on indices. There is no return value. + Series/DataFrame will be aligned on indexes, and whenever possible, + the dtype of the individual Series of the caller will be preserved. + + There is no return value. 
Parameters ---------- @@ -4198,7 +4201,7 @@ def update(self, other, join='left', overwrite=True, filter_func=None, * False: only update values that are NA in the original DataFrame. - filter_func : callable(1d-array) -> boolean 1d-array, optional + filter_func : callable(1d-array) -> bool 1d-array, optional Can choose to replace values other than NA. Return True for values that should be updated. errors : {'raise', 'ignore'}, default 'ignore' @@ -4208,7 +4211,7 @@ def update(self, other, join='left', overwrite=True, filter_func=None, Raises ------ ValueError - When `raise_conflict` is True and there's overlapping non-NA data. + When `errors='raise'` and there's overlapping non-NA data. Returns ------- @@ -4275,10 +4278,10 @@ def update(self, other, join='left', overwrite=True, filter_func=None, >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]}) >>> df.update(new_df) >>> df - A B - 0 1 4.0 - 1 2 500.0 - 2 3 6.0 + A B + 0 1 4 + 1 2 500 + 2 3 6 """ from pandas import Series, DataFrame # TODO: Support other joins @@ -4292,14 +4295,20 @@ def update(self, other, join='left', overwrite=True, filter_func=None, this = self.values that = other.values - # missing.update_array returns an np.ndarray - updated_values = missing.update_array(this, that, + # will return None if "this" remains unchanged + updated_array = missing._update_array(this, that, overwrite=overwrite, filter_func=filter_func, errors=errors) # don't overwrite unnecessarily - if updated_values is not None: - self._update_inplace(Series(updated_values, index=self.index)) + if updated_array is not None: + # avoid unnecessary upcasting (introduced by alignment) + try: + updated = Series(updated_array, index=self.index, + dtype=this.dtype) + except ValueError: + updated = Series(updated_array, index=self.index) + self._update_inplace(updated) else: # DataFrame if not isinstance(other, ABCDataFrame): other = DataFrame(other) @@ -4310,11 +4319,23 @@ def update(self, other, join='left', overwrite=True, 
filter_func=None, this = self[col].values that = other[col].values - updated = missing.update_array(this, that, overwrite=overwrite, - filter_func=filter_func, - errors=errors) + # will return None if "this" remains unchanged + updated_array = missing._update_array(this, that, + overwrite=overwrite, + filter_func=filter_func, + errors=errors) # don't overwrite unnecessarily - if updated is not None: + if updated_array is not None: + # no problem to set DataFrame column with array + updated = updated_array + + if updated_array.dtype != this.dtype: + # avoid unnecessary upcasting (introduced by alignment) + try: + updated = Series(updated_array, index=self.index, + dtype=this.dtype) + except ValueError: + pass self[col] = updated def filter(self, items=None, like=None, regex=None, axis=None): diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 1b8b2fa3f4656..b6842d83ae850 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -106,8 +106,8 @@ def update_array(this, that, overwrite=True, filter_func=None, Returns ------- - updated : np.ndarray (one-dimensional) or None - The updated array. Return None if `this` remains unchanged + updated : np.ndarray (one-dimensional) + The updated array. See Also -------- @@ -115,6 +115,16 @@ def update_array(this, that, overwrite=True, filter_func=None, DataFrame.update : Similar method for `DataFrame`. dict.update : Similar method for `dict`. """ + updated = _update_array(this, that, overwrite=overwrite, + filter_func=filter_func, errors=errors) + return this if updated is None else updated + + +def _update_array(this, that, overwrite=True, filter_func=None, + errors='ignore'): + """ + Same as update_array, except we return None if `this` is not updated. 
+ """ import pandas.core.computation.expressions as expressions if filter_func is not None: diff --git a/pandas/core/series.py b/pandas/core/series.py index 1863064703426..df7ebb210a95a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2390,7 +2390,10 @@ def update(self, other, join='left', overwrite=True, filter_func=None, """ Modify Series in place using non-NA values from passed Series. - Aligns on index. + Series will be aligned on indexes, and whenever possible, the dtype of + the caller will be preserved. + + There is no return value. Parameters ---------- @@ -2411,7 +2414,7 @@ def update(self, other, join='left', overwrite=True, filter_func=None, the original DataFrame. .. versionadded:: 0.24.0 - filter_func : callable(1d-array) -> boolean 1d-array, optional + filter_func : callable(1d-array) -> bool 1d-array, optional Can choose to replace values other than NA. Return True for values that should be updated. @@ -2422,10 +2425,19 @@ def update(self, other, join='left', overwrite=True, filter_func=None, .. versionadded:: 0.24.0 + Raises + ------ + ValueError + When `errors='raise'` and there's overlapping non-NA data. + + Returns + ------- + Nothing, the Series is modified inplace. + See Also -------- DataFrame.update : Similar method for `DataFrame`. - dict.update : Similar method for `dict` + dict.update : Similar method for `dict`. 
Examples -------- @@ -2459,10 +2471,10 @@ def update(self, other, join='left', overwrite=True, filter_func=None, >>> s = pd.Series([1, 2, 3]) >>> s.update(pd.Series([4, np.nan, 6])) >>> s - 0 4.0 - 1 2.0 - 2 6.0 - dtype: float64 + 0 4 + 1 2 + 2 6 + dtype: int64 """ super(Series, self).update(other, join=join, overwrite=overwrite, filter_func=filter_func, diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index 25c5222b5f03c..75465597e6c18 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -279,6 +279,25 @@ def test_update_dtypes(self): columns=['A', 'B', 'bool1', 'bool2']) assert_frame_equal(df, expected) + df = DataFrame([[10, 100], [11, 101], [12, 102]], columns=['A', 'B']) + other = DataFrame([[61, 601], [63, 603]], columns=['A', 'B'], + index=[1, 3]) + df.update(other) + + expected = DataFrame([[10, 100], [61, 601], [12, 102]], + columns=['A', 'B']) + assert_frame_equal(df, expected) + + # we always try to keep original dtype, even if other has different one + df.update(other.astype(float)) + assert_frame_equal(df, expected) + + # if keeping the dtype is not possible, we allow upcasting + df.update(other + 0.1) + expected = DataFrame([[10., 100.], [61.1, 601.1], [12., 102.]], + columns=['A', 'B']) + assert_frame_equal(df, expected) + def test_update_nooverwrite(self): df = DataFrame([[1.5, nan, 3.], [1.5, nan, 3.], diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py index 65854eb5a4d99..9392d2caee925 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -10,7 +10,7 @@ import pandas as pd from pandas import DataFrame, DatetimeIndex, Series, compat, date_range import pandas.util.testing as tm -from pandas.util.testing import assert_series_equal +from pandas.util.testing import assert_series_equal, assert_frame_equal class TestSeriesCombine(): @@ -105,8 +105,8 
@@ def test_combine_first(self): assert_series_equal(s, result) def test_update(self): - s = Series([1.5, nan, 3., 4., nan]) - s2 = Series([nan, 3.5, nan, 5.]) + s = Series([1.5, np.nan, 3., 4., np.nan]) + s2 = Series([np.nan, 3.5, np.nan, 5.]) s.update(s2) expected = Series([1.5, 3.5, 3., 5., np.nan]) @@ -116,8 +116,35 @@ def test_update(self): df = DataFrame([{"a": 1}, {"a": 3, "b": 2}]) df['c'] = np.nan - # this will fail as long as series is a sub-class of ndarray - # df['c'].update(Series(['foo'],index=[0])) ##### + df['c'].update(Series(['foo'], index=[0])) + expected = DataFrame([[1, np.nan, 'foo'], [3, 2., np.nan]], + columns=['a', 'b', 'c']) + assert_frame_equal(df, expected) + + def test_update_dtypes(self): + s = Series([1., 2., False, True]) + + other = Series([45]) + s.update(other) + + expected = Series([45., 2., False, True]) + assert_series_equal(s, expected) + + s = Series([10, 11, 12]) + other = Series([61, 63], index=[1, 3]) + s.update(other) + + expected = Series([10, 61, 12]) + assert_series_equal(s, expected) + + # we always try to keep original dtype, even if other has different one + s.update(other.astype(float)) + assert_series_equal(s, expected) + + # if keeping the dtype is not possible, we allow upcasting + s.update(other + 0.1) + expected = Series([10., 61.1, 12.]) + assert_series_equal(s, expected) def test_update_nooverwrite(self): s = Series([0, 1, 2, np.nan, np.nan, 5, 6, np.nan]) @@ -129,6 +156,8 @@ def test_update_nooverwrite(self): assert_series_equal(s, expected) def test_update_filtered(self): + # for small values, np.arange defaults to int32, + # but pandas default (e.g. for "expected" below) is int64 s = Series(np.arange(8), dtype='int64') other = Series(np.arange(8), dtype='int64') + 10 From e500a22196d916f8a4aae43eb3618ec27d56e559 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Thu, 15 Nov 2018 19:33:26 +0100 Subject: [PATCH 07/10] Fix Panel.update --- doc/source/whatsnew/v0.24.0.rst | 2 +- pandas/tests/test_panel.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 6066f0ec44147..9889f1ee3c1db 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1026,7 +1026,7 @@ Deprecations - The ``fastpath`` keyword of the different Index constructors is deprecated (:issue:`23110`). - :meth:`Timestamp.tz_localize`, :meth:`DatetimeIndex.tz_localize`, and :meth:`Series.tz_localize` have deprecated the ``errors`` argument in favor of the ``nonexistent`` argument (:issue:`8917`) - The class ``FrozenNDArray`` has been deprecated. When unpickling, ``FrozenNDArray`` will be unpickled to ``np.ndarray`` once this class is removed (:issue:`9031`) -- The method :meth:`DataFrame.update` has deprecated the ``raise_conflict=False|True`` keyword in favor of ``errors='ignore'|'raise'`` (:issue:`23585`) +- The methods :meth:`DataFrame.update` and :meth:`Panel.update` have deprecated the ``raise_conflict=False|True`` keyword in favor of ``errors='ignore'|'raise'`` (:issue:`23585`) - Deprecated the `nthreads` keyword of :func:`pandas.read_feather` in favor of `use_threads` to reflect the changes in pyarrow 0.11.0. (:issue:`23053`) - :func:`pandas.read_excel` has deprecated accepting ``usecols`` as an integer. 
Please pass in a list of ints from 0 to ``usecols`` inclusive instead (:issue:`23527`) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 0e45fd6411ac0..522141da1015c 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -2358,6 +2358,7 @@ def test_update_raise_on_overlap(self): [[1.5, np.nan, 3.], [1.5, np.nan, 3.], [1.5, np.nan, 3.], [1.5, np.nan, 3.]]]) + other = Panel([[[]]]) with pytest.raises(ValueError, match='Data overlaps'): pan.update(pan, errors='raise') From d49b74280d6d1523456397480514550f0c0a392a Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Tue, 13 Nov 2018 08:07:04 +0100 Subject: [PATCH 08/10] Update whatsnew with GH reference --- doc/source/whatsnew/v0.24.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 9889f1ee3c1db..a34ec74f0b645 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -299,6 +299,7 @@ Backwards incompatible API changes - A newly constructed empty :class:`DataFrame` with integer as the ``dtype`` will now only be cast to ``float64`` if ``index`` is specified (:issue:`22858`) - :meth:`Series.str.cat` will now raise if `others` is a `set` (:issue:`23009`) - Passing scalar values to :class:`DatetimeIndex` or :class:`TimedeltaIndex` will now raise ``TypeError`` instead of ``ValueError`` (:issue:`23539`) +- :meth:`DataFrame.update` will now try to preserve the dtype of the caller as much as possible (:issue:`23606`) .. _whatsnew_0240.api_breaking.deps: From 9183b508eb1bc5e846863489c2fa8287fd79b0f1 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Sun, 11 Nov 2018 11:13:57 +0100 Subject: [PATCH 09/10] Lint, isort, fixes --- pandas/core/generic.py | 17 ++++++++++++----- pandas/core/missing.py | 2 +- pandas/tests/series/test_combine_concat.py | 5 ++--- pandas/tests/test_panel.py | 1 - 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 17fbe060dd43a..d20b1cad501ee 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4208,14 +4208,21 @@ def update(self, other, join='left', overwrite=True, filter_func=None, If 'raise', will raise a ValueError if the DataFrame and `other` both contain non-NA data in the same place. - Raises - ------ - ValueError - When `errors='ignore'` and there's overlapping non-NA data. + .. versionchanged :: 0.24.0 + Changed from `raise_conflict=False|True` + to `errors='ignore'|'raise'`. Returns ------- - Nothing, the object is modified inplace. + None : method directly changes calling object + + Raises + ------ + ValueError + * When `errors='raise'` and there's overlapping non-NA data. + * When `errors` is not either `'ignore'` or `'raise'` + NotImplementedError + * If `join != 'left'` See Also -------- diff --git a/pandas/core/missing.py b/pandas/core/missing.py index b6842d83ae850..b2787c958c8fb 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -116,7 +116,7 @@ def update_array(this, that, overwrite=True, filter_func=None, dict.update : Similar method for `dict`. 
""" updated = _update_array(this, that, overwrite=overwrite, - filter_func=filter_func, errors=errors) + filter_func=filter_func, errors=errors) return this if updated is None else updated diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py index 9392d2caee925..385f266a61970 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -4,13 +4,12 @@ from datetime import datetime import numpy as np -from numpy import nan import pytest import pandas as pd from pandas import DataFrame, DatetimeIndex, Series, compat, date_range import pandas.util.testing as tm -from pandas.util.testing import assert_series_equal, assert_frame_equal +from pandas.util.testing import assert_frame_equal, assert_series_equal class TestSeriesCombine(): @@ -170,7 +169,7 @@ def test_update_raise(self): s = Series([0, 1, 2, np.nan, np.nan, 5, 6, np.nan]) other = Series([1, 3, np.nan, 7, 9], index=[1, 3, 5, 7, 9]) - with tm.assert_raises_regex(ValueError, "Data overlaps"): + with pytest.raises(ValueError, match="Data overlaps"): s.update(other, errors='raise') def test_concat_empty_series_dtypes_roundtrips(self): diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 522141da1015c..0e45fd6411ac0 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -2358,7 +2358,6 @@ def test_update_raise_on_overlap(self): [[1.5, np.nan, 3.], [1.5, np.nan, 3.], [1.5, np.nan, 3.], [1.5, np.nan, 3.]]]) - other = Panel([[[]]]) with pytest.raises(ValueError, match='Data overlaps'): pan.update(pan, errors='raise') From 411741a1aa70cb3e2ec2276a715b46c60f65ac1e Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Thu, 15 Nov 2018 19:47:27 +0100 Subject: [PATCH 10/10] fix oversight --- pandas/core/generic.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d20b1cad501ee..9dae10dde8542 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4294,6 +4294,9 @@ def update(self, other, join='left', overwrite=True, filter_func=None, # TODO: Support other joins if join != 'left': # pragma: no cover raise NotImplementedError("Only left join is supported") + if errors not in ['ignore', 'raise']: + raise ValueError("The parameter errors must be either " + "'ignore' or 'raise'") if isinstance(self, ABCSeries): if not isinstance(other, ABCSeries):