Skip to content

BUG: replace of numeric by string / dtype conversion (GH15743) #15812

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 12 commits into from
2 changes: 1 addition & 1 deletion RELEASE.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Release Notes
=============

The list of changes to pandas between each release can be found
The list of changes to Pandas between each release can be found
[here](http://pandas.pydata.org/pandas-docs/stable/whatsnew.html). For full
details, see the commit logs at http://github.com/pandas-dev/pandas.
5 changes: 4 additions & 1 deletion doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -883,6 +883,8 @@ Bug Fixes

- Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`)
- Bug in ``.replace()`` may result in incorrect dtypes. (:issue:`12747`, :issue:`15765`)
- Bug in ``Series.replace`` and ``DataFrame.replace`` which failed on empty replacement dicts (:issue:`15289`)
- Bug in ``Series.replace`` where a numeric value was replaced when ``to_replace`` was the equivalent string (:issue:`15743`)

- Bug in ``.asfreq()``, where frequency was not set for empty ``Series`` (:issue:`14320`)

Expand Down Expand Up @@ -985,7 +987,8 @@ Bug Fixes

- Bug in ``DataFrame.hist`` where ``plt.tight_layout`` caused an ``AttributeError`` (use ``matplotlib >= 2.0.1``) (:issue:`9351`)
- Bug in ``DataFrame.boxplot`` where ``fontsize`` was not applied to the tick labels on both axes (:issue:`15108`)
- Bug in ``Series.replace`` and ``DataFrame.replace`` which failed on empty replacement dicts (:issue:`15289`)
- Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`)
- Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`)
- Bug in ``pd.read_msgpack`` which did not allow to load dataframe with an index of type ``CategoricalIndex`` (:issue:`15487`)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI for the future: if you put this somewhere within the Bug Fixes section, rather than at the end, you won't have merge conflicts (we leave blank lines for this purpose).

- Bug in ``Series.replace`` where a numeric value was replaced when ``to_replace`` was the equivalent string (:issue:`15743`)
26 changes: 14 additions & 12 deletions pandas/core/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,16 @@

from pandas.compat import range, string_types
from pandas.types.common import (is_numeric_v_string_like,
is_float_dtype, is_datetime64_dtype,
is_datetime64tz_dtype, is_integer_dtype,
_ensure_float64, is_scalar,
needs_i8_conversion, is_integer)
is_float_dtype,
is_datetime64_dtype,
is_datetime64tz_dtype,
is_integer_dtype,
is_scalar,
is_integer,
needs_i8_conversion,
_ensure_float64)

from pandas.types.cast import infer_dtype_from_array
from pandas.types.missing import isnull


Expand All @@ -21,11 +27,11 @@ def mask_missing(arr, values_to_mask):
Return a masking array of same size/shape as arr
with entries equaling any member of values_to_mask set to True
"""
if not isinstance(values_to_mask, (list, np.ndarray)):
values_to_mask = [values_to_mask]
dtype, values_to_mask = infer_dtype_from_array(values_to_mask)

try:
values_to_mask = np.array(values_to_mask, dtype=arr.dtype)
values_to_mask = np.array(values_to_mask, dtype=dtype)

except Exception:
values_to_mask = np.array(values_to_mask, dtype=object)

Expand Down Expand Up @@ -409,7 +415,7 @@ def interpolate_2d(values, method='pad', axis=0, limit=None, fill_value=None,
if axis != 0: # pragma: no cover
raise AssertionError("cannot interpolate on a ndim == 1 with "
"axis != 0")
values = values.reshape(tuple((1, ) + values.shape))
values = values.reshape(tuple((1,) + values.shape))

if fill_value is None:
mask = None
Expand Down Expand Up @@ -447,7 +453,6 @@ def wrapper(arr, mask, limit=None):


def pad_1d(values, limit=None, mask=None, dtype=None):

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Normally I don't like to edit things not associated with the PR (e.g. you may have some editor setting which changes this)... no big deal.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok... Sorry for that... I'm using IntelliJ IDEA, and it reformatted the whole file to the PEP8 standard.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No problem. We don't quite follow PEP8 (as flake8 doesn't, actually)...

if dtype is None:
dtype = values.dtype
_method = None
Expand All @@ -472,7 +477,6 @@ def pad_1d(values, limit=None, mask=None, dtype=None):


def backfill_1d(values, limit=None, mask=None, dtype=None):

if dtype is None:
dtype = values.dtype
_method = None
Expand All @@ -498,7 +502,6 @@ def backfill_1d(values, limit=None, mask=None, dtype=None):


def pad_2d(values, limit=None, mask=None, dtype=None):

if dtype is None:
dtype = values.dtype
_method = None
Expand Down Expand Up @@ -528,7 +531,6 @@ def pad_2d(values, limit=None, mask=None, dtype=None):


def backfill_2d(values, limit=None, mask=None, dtype=None):

if dtype is None:
dtype = values.dtype
_method = None
Expand Down
25 changes: 15 additions & 10 deletions pandas/tests/frame/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -795,7 +795,7 @@ def test_replace_dtypes(self):
expected = DataFrame({'datetime64': Index([now] * 3)})
assert_frame_equal(result, expected)

def test_replace_input_formats(self):
def test_replace_input_formats_listlike(self):
# both dicts
to_rep = {'A': np.nan, 'B': 0, 'C': ''}
values = {'A': 0, 'B': -1, 'C': 'missing'}
Expand All @@ -812,15 +812,6 @@ def test_replace_input_formats(self):
'C': ['', 'asdf', 'fd']})
assert_frame_equal(result, expected)

# dict to scalar
filled = df.replace(to_rep, 0)
expected = {}
for k, v in compat.iteritems(df):
expected[k] = v.replace(to_rep[k], 0)
assert_frame_equal(filled, DataFrame(expected))

self.assertRaises(TypeError, df.replace, to_rep, [np.nan, 0, ''])

# scalar to dict
values = {'A': 0, 'B': -1, 'C': 'missing'}
df = DataFrame({'A': [np.nan, 0, np.nan], 'B': [0, 2, 5],
Expand All @@ -842,6 +833,20 @@ def test_replace_input_formats(self):

self.assertRaises(ValueError, df.replace, to_rep, values[1:])

def test_replace_input_formats_scalar(self):
df = DataFrame({'A': [np.nan, 0, np.inf], 'B': [0, 2, 5],
'C': ['', 'asdf', 'fd']})

# dict to scalar
to_rep = {'A': np.nan, 'B': 0, 'C': ''}
filled = df.replace(to_rep, 0)
expected = {}
for k, v in compat.iteritems(df):
expected[k] = v.replace(to_rep[k], 0)
assert_frame_equal(filled, DataFrame(expected))

self.assertRaises(TypeError, df.replace, to_rep, [np.nan, 0, ''])

# list to scalar
to_rep = [np.nan, 0, '']
result = df.replace(to_rep, -1)
Expand Down
22 changes: 21 additions & 1 deletion pandas/tests/series/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@


class TestSeriesReplace(TestData, tm.TestCase):

def test_replace(self):
N = 100
ser = pd.Series(np.random.randn(N))
Expand Down Expand Up @@ -227,3 +226,24 @@ def test_replace_with_empty_dictlike(self):
s = pd.Series(list('abcd'))
tm.assert_series_equal(s, s.replace(dict()))
tm.assert_series_equal(s, s.replace(pd.Series([])))

def test_replace_string_with_number(self):
# GH 15743
s = pd.Series([1, 2, 3])
result = s.replace('2', np.nan)
expected = pd.Series([1, 2, 3])
tm.assert_series_equal(expected, result)

def test_replace_unicode_with_number(self):
# GH 15743
s = pd.Series([1, 2, 3])
result = s.replace(u'2', np.nan)
expected = pd.Series([1, 2, 3])
tm.assert_series_equal(expected, result)

def test_replace_mixed_types_with_string(self):
    """Replace both a number and a string in a mixed-type Series."""
    # Testing mixed
    mixed = pd.Series([1, 2, 3, '4', 4, 5])
    expected = pd.Series([1, np.nan, 3, np.nan, 4, 5])

    # the integer 2 and the string '4' are replaced; the integer 4 is kept
    result = mixed.replace([2, '4'], np.nan)

    tm.assert_series_equal(expected, result)
50 changes: 35 additions & 15 deletions pandas/tests/types/test_cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,15 @@

"""

from datetime import datetime
import pytest
from datetime import datetime, timedelta, date
import numpy as np

from pandas import Timedelta, Timestamp, DatetimeIndex
from pandas.types.cast import (maybe_downcast_to_dtype,
maybe_convert_objects,
infer_dtype_from_scalar,
infer_dtype_from_array,
maybe_convert_string_to_object,
maybe_convert_scalar,
find_common_type)
Expand Down Expand Up @@ -82,7 +84,7 @@ def test_datetime_with_timezone(self):
tm.assert_index_equal(res, exp)


class TestInferDtype(tm.TestCase):
class TestInferDtype(object):

def test_infer_dtype_from_scalar(self):
# Test that _infer_dtype_from_scalar is returning correct dtype for int
Expand All @@ -92,44 +94,62 @@ def test_infer_dtype_from_scalar(self):
np.int32, np.uint64, np.int64]:
data = dtypec(12)
dtype, val = infer_dtype_from_scalar(data)
self.assertEqual(dtype, type(data))
assert dtype == type(data)

data = 12
dtype, val = infer_dtype_from_scalar(data)
self.assertEqual(dtype, np.int64)
assert dtype == np.int64

for dtypec in [np.float16, np.float32, np.float64]:
data = dtypec(12)
dtype, val = infer_dtype_from_scalar(data)
self.assertEqual(dtype, dtypec)
assert dtype == dtypec

data = np.float(12)
dtype, val = infer_dtype_from_scalar(data)
self.assertEqual(dtype, np.float64)
assert dtype == np.float64

for data in [True, False]:
dtype, val = infer_dtype_from_scalar(data)
self.assertEqual(dtype, np.bool_)
assert dtype == np.bool_

for data in [np.complex64(1), np.complex128(1)]:
dtype, val = infer_dtype_from_scalar(data)
self.assertEqual(dtype, np.complex_)
assert dtype == np.complex_

import datetime
for data in [np.datetime64(1, 'ns'), Timestamp(1),
datetime.datetime(2000, 1, 1, 0, 0)]:
datetime(2000, 1, 1, 0, 0)]:
dtype, val = infer_dtype_from_scalar(data)
self.assertEqual(dtype, 'M8[ns]')
assert dtype == 'M8[ns]'

for data in [np.timedelta64(1, 'ns'), Timedelta(1),
datetime.timedelta(1)]:
timedelta(1)]:
dtype, val = infer_dtype_from_scalar(data)
self.assertEqual(dtype, 'm8[ns]')
assert dtype == 'm8[ns]'

for data in [datetime.date(2000, 1, 1),
for data in [date(2000, 1, 1),
Timestamp(1, tz='US/Eastern'), 'foo']:
dtype, val = infer_dtype_from_scalar(data)
self.assertEqual(dtype, np.object_)
assert dtype == np.object_

@pytest.mark.parametrize(
    "arr, expected",
    [
        ('foo', np.object_),
        (b'foo', np.object_),
        (1, np.int_),
        # np.float64 is the type the removed np.float_ alias referred to
        (1.5, np.float64),
        ([1], np.int_),
        (np.array([1]), np.int_),
        ([np.nan, 1, ''], np.object_),
        (np.array([[1.0, 2.0]]), np.float64),
        (Timestamp('20160101'), np.object_),
        (np.datetime64('2016-01-01'), np.dtype('<M8[D]')),
    ])
def test_infer_dtype_from_array(self, arr, expected):
    """Check the dtype inferred for each scalar / array-like input."""
    # these infer specifically to numpy dtypes
    inferred_dtype, _ = infer_dtype_from_array(arr)
    assert inferred_dtype == expected


class TestMaybe(tm.TestCase):
Expand Down
44 changes: 44 additions & 0 deletions pandas/types/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,50 @@ def infer_dtype_from_scalar(val, pandas_dtype=False):
return dtype, val


def infer_dtype_from_array(arr):
    """
    Infer a numpy-compatible dtype for a scalar or array-like.

    Parameters
    ----------
    arr : scalar or array

    Returns
    -------
    tuple (numpy-compat dtype, array)
        For an ndarray input, its own dtype and the same object.
        For string / bytes / unicode / mixed content, ``np.object_`` and
        the values as a list-like (a scalar is wrapped in a list).
        Otherwise the dtype and result of ``np.asarray`` on the values.

    Notes
    -----
    The inferred dtypes match numpy's exactly, except that
    mixed / object content is reported as object instead of being
    stringified or otherwise converted by numpy.

    Examples
    --------
    >>> np.asarray([1, '1'])
    array(['1', '1'], dtype='<U21')

    >>> infer_dtype_from_array([1, '1'])
    (numpy.object_, [1, '1'])

    """
    # an ndarray already carries its dtype; hand it back untouched
    if isinstance(arr, np.ndarray):
        return arr.dtype, arr

    # wrap a scalar so the inference below always sees a list-like
    values = arr if is_list_like(arr) else [arr]

    # inspect the values first so numpy is not asked to coerce
    # string-like or mixed content (e.g. alongside nan's)
    kind = lib.infer_dtype(values)
    if kind in ('string', 'bytes', 'unicode', 'mixed', 'mixed-integer'):
        return np.object_, values

    converted = np.asarray(values)
    return converted.dtype, converted


def maybe_upcast(values, fill_value=np.nan, dtype=None, copy=False):
""" provide explict type promotion and coercion

Expand Down