From c27703c21f7f21c91643c1ee7176c24f0076bba2 Mon Sep 17 00:00:00 2001 From: Keming Zhang Date: Fri, 26 Feb 2016 15:14:40 -0500 Subject: [PATCH 1/5] fixed issue 10503: Simple operation unexpectedly changes dtype. --- pandas/core/internals.py | 4 +++- pandas/tests/test_generic.py | 10 ++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 8563481c8564d..c4c02b0ccac99 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -702,7 +702,9 @@ def _is_empty_indexer(indexer): values[indexer] = value # coerce and try to infer the dtypes of the result - if np.isscalar(value): + if is_dtype_equal(values.dtype, getattr(value, 'dtype', None)): + dtype = value.dtype + elif np.isscalar(value): dtype, _ = _infer_dtype_from_scalar(value) else: dtype = 'infer' diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 4c7510783eda0..8b002828a336f 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -361,6 +361,16 @@ def test_head_tail(self): self._compare(o.head(-3), o.head(7)) self._compare(o.tail(-3), o.tail(7)) + def test_dtype_after_slice_update(self): + # GH10503 + + df = pd.DataFrame({'a': [0, 1, 1], 'b': [100, 200, 300]}, + dtype='uint32') + ix = df['a'] == 1 + newb = df.loc[ix, 'b'] + 1 + df.loc[ix, 'b'] = newb + assert_equal(df['a'].dtype, newb.dtype) + def test_sample(self): # Fixes issue: 2419 From a2c9b7bd99ac4f7268770ba95e86661c080e14ca Mon Sep 17 00:00:00 2001 From: Keming Zhang Date: Fri, 26 Feb 2016 15:31:13 -0500 Subject: [PATCH 2/5] added one test case. 
--- pandas/tests/test_generic.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 8b002828a336f..d5b03cb0a079a 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -364,12 +364,20 @@ def test_head_tail(self): def test_dtype_after_slice_update(self): # GH10503 - df = pd.DataFrame({'a': [0, 1, 1], 'b': [100, 200, 300]}, + # assigning the same type should not change the type + df1 = pd.DataFrame({'a': [0, 1, 1], 'b': [100, 200, 300]}, dtype='uint32') - ix = df['a'] == 1 - newb = df.loc[ix, 'b'] + 1 - df.loc[ix, 'b'] = newb - assert_equal(df['a'].dtype, newb.dtype) + ix = df1['a'] == 1 + newb1 = df1.loc[ix, 'b'] + 1 + df1.loc[ix, 'b'] = newb1 + assert_equal(df1['a'].dtype, newb1.dtype) + + # assigning a new type should get the inferred type + df2 = pd.DataFrame({'a': [0, 1, 1], 'b': [100, 200, 300]}, + dtype='uint64') + newb2 = df2.loc[ix, 'b'] + df1.loc[ix, 'b'] = newb2 + assert_equal(df1['a'].dtype, np.dtype('int64')) def test_sample(self): # Fixes issue: 2419 From 329814e1133111fb5b4b46d3d0758cbefef7f10f Mon Sep 17 00:00:00 2001 From: Keming Zhang Date: Fri, 26 Feb 2016 15:42:23 -0500 Subject: [PATCH 3/5] formatted python code. 
--- pandas/tests/test_generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index d5b03cb0a079a..591ffc9a68c7a 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -366,7 +366,7 @@ def test_dtype_after_slice_update(self): # assigning the same type should not change the type df1 = pd.DataFrame({'a': [0, 1, 1], 'b': [100, 200, 300]}, - dtype='uint32') + dtype='uint32') ix = df1['a'] == 1 newb1 = df1.loc[ix, 'b'] + 1 df1.loc[ix, 'b'] = newb1 @@ -374,7 +374,7 @@ def test_dtype_after_slice_update(self): # assigning a new type should get the inferred type df2 = pd.DataFrame({'a': [0, 1, 1], 'b': [100, 200, 300]}, - dtype='uint64') + dtype='uint64') newb2 = df2.loc[ix, 'b'] df1.loc[ix, 'b'] = newb2 assert_equal(df1['a'].dtype, np.dtype('int64')) From eaa7b76bb7d05aa7cf15e98a3ef3a645aa182f2c Mon Sep 17 00:00:00 2001 From: Keming Zhang Date: Fri, 26 Feb 2016 18:27:04 -0500 Subject: [PATCH 4/5] fixed a bug of type comparison and fixed exposed test case errors. 
--- pandas/core/internals.py | 3 ++- pandas/tests/test_indexing.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index c4c02b0ccac99..51bd9fd0e952c 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -702,7 +702,8 @@ def _is_empty_indexer(indexer): values[indexer] = value # coerce and try to infer the dtypes of the result - if is_dtype_equal(values.dtype, getattr(value, 'dtype', None)): + if hasattr(value, 'dtype') and is_dtype_equal(values.dtype, + value.dtype): dtype = value.dtype elif np.isscalar(value): dtype, _ = _infer_dtype_from_scalar(value) diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 1c0986b025acc..9497dbc6ddeef 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -3256,12 +3256,12 @@ def test_multiindex_assignment(self): df.ix[4, 'c'] = arr assert_series_equal(df.ix[4, 'c'], Series(arr, index=[8, 10], name='c', - dtype='int64')) + dtype='float64')) # scalar ok df.ix[4, 'c'] = 10 assert_series_equal(df.ix[4, 'c'], Series(10, index=[8, 10], name='c', - dtype='int64')) + dtype='float64')) # invalid assignments def f(): From 2dcad0cefc3d76ae684b4dccaed8d79a4f378774 Mon Sep 17 00:00:00 2001 From: Keming Zhang Date: Fri, 26 Feb 2016 19:55:31 -0500 Subject: [PATCH 5/5] updated whatsnew with new behavior and old behavior. --- doc/source/whatsnew/v0.18.0.txt | 84 +++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index 8a48314de5f77..9e62ba22d8f96 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -362,6 +362,89 @@ New Behavior: s.index print(s.to_csv(path=None)) +Changes to dtype assignment behaviors +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When a DataFrame's slice is updated with a new slice of the same +dtype, the dtype of the DataFrame will now remain the same. 
+ +Previous Behavior: + +.. code-block:: python + + In [2]: df = pd.DataFrame({'a':[0, 1, 1], 'b':[100, 200, 300]}, dtype='uint32') + + In [3]: df.info() + + RangeIndex: 3 entries, 0 to 2 + Data columns (total 2 columns): + a 3 non-null uint32 + b 3 non-null uint32 + dtypes: uint32(2) + memory usage: 96.0 bytes + + In [4]: ix = df['a'] == 1 + + In [5]: df.loc[ix, 'b'] = df.loc[ix, 'b'] + + In [6]: df.info() + + RangeIndex: 3 entries, 0 to 2 + Data columns (total 2 columns): + a 3 non-null int64 + b 3 non-null int64 + dtypes: int64(2) + +New Behavior: + +.. ipython:: python + + df = pd.DataFrame({'a':[0, 1, 1], 'b':[100, 200, 300]}, dtype='uint32') + df.info() + ix = df['a'] == 1 + df.loc[ix, 'b'] = df.loc[ix, 'b'] + df.info() + + +When a DataFrame's integer slice is partially updated with a new slice of floats that +could potentially be downcast to integer without losing precision, +the dtype of the slice will be set to float instead of integer. + +Previous Behavior: + +.. code-block:: python + + In [4]: df = pd.DataFrame(np.array(range(1,10)).reshape(3,3), + ...: columns=list('abc'), + ...: index=[[4,4,8], [8,10,12]]) + + In [5]: df + Out[5]: + a b c + 4 8 1 2 3 + 10 4 5 6 + 8 12 7 8 9 + + In [6]: df.ix[4, 'c'] = np.array([0., 1.]) + + In [7]: df + Out[7]: + a b c + 4 8 1 2 0 + 10 4 5 1 + 8 12 7 8 9 + +New Behavior: + +.. ipython:: python + + df = pd.DataFrame(np.array(range(1,10)).reshape(3,3), + columns=list('abc'), + index=[[4,4,8], [8,10,12]]) + df + df.ix[4, 'c'] = np.array([0., 1.]) + df + .. _whatsnew_0180.enhancements.xarray: to_xarray @@ -1120,3 +1203,4 @@ Bug Fixes - Bug in ``DataFrame.apply`` in which reduction was not being prevented for cases in which ``dtype`` was not a numpy dtype (:issue:`12244`) - Bug when initializing categorical series with a scalar value. 
(:issue:`12336`) - Bug when specifying a UTC ``DatetimeIndex`` by setting ``utc=True`` in ``.to_datetime`` (:issue:`11934`) +- Bug when modifying a slice of a ``DataFrame`` with the same ``dtype``, the ``dtype`` of the ``DataFrame`` could unexpectedly change. (:issue:`10503`).