diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index bfec1ec3ebe8c..e7ecd829f344c 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1909,3 +1909,27 @@ def pandas_dtype(dtype): raise TypeError('dtype {0} not understood'.format(dtype)) return npdtype + + +def _is_fillable_value(value): + pandas_ts_types = ('Timestamp', 'Period', 'Timedelta') + pandas_block_types = ('Series', 'DataFrame') + + if any([isinstance(value, (list, dict)), + callable(value), + (not (isinstance(value, string_types) or + isinstance(value, (int, float, complex, str, None.__class__)) or + is_numeric_dtype(value) or + is_datetime_or_timedelta_dtype(value) or + is_period_dtype(value) or + type(value).__name__ in pandas_ts_types) or + type(value).__name__ in pandas_block_types)]): + return False + else: + return True + + +def validate_fill_value(value): + if not _is_fillable_value(value): + raise TypeError('"value" parameter must be a scalar, but ' + 'you passed a "{0}"'.format(type(value).__name__)) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index af3a873bc2866..0846723170b3b 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -19,7 +19,10 @@ is_object_dtype, is_integer, _TD_DTYPE, - _NS_DTYPE) + _NS_DTYPE, + is_datetime64_any_dtype, is_float, + is_numeric_dtype, is_complex, is_period_arraylike) +from datetime import datetime, timedelta from .inference import is_list_like @@ -394,3 +397,32 @@ def na_value_for_dtype(dtype): elif is_bool_dtype(dtype): return False return np.nan + + +def is_valid_fill_value(value, dtype): + """ + Makes sure the fill value is appropriate for the given dtype. + + Parameters + ---------- + value : scalar + dtype: string / dtype + """ + if isinstance(value, dict): + return True + if not is_scalar(value): + # maybe always raise? + # raise TypeError('"value" parameter must be a scalar or dict, but ' + # 'you passed a "{0}"'.format(type(value).__name__)) + return False + elif isnull(value): + return True + elif is_bool_dtype(dtype): + return isinstance(value, (np.bool, bool)) + elif is_numeric_dtype(dtype): + return is_float(value) or is_integer(value) or is_complex(value) + elif is_datetime64_any_dtype(dtype): + return isinstance(value, (np.datetime64, datetime)) + elif is_timedelta64_dtype(dtype): + return isinstance(value, (np.timedelta64, timedelta)) + return True diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8b186bab29d5e..0a2b1c2c79d4c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -28,7 +28,7 @@ pandas_dtype) from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask from pandas.core.dtypes.missing import isnull, notnull -from pandas.core.dtypes.generic import ABCSeries, ABCPanel +from pandas.core.dtypes.generic import ABCSeries, ABCPanel, ABCDataFrame from pandas.core.common import (_values_from_object, _maybe_box_datetimelike, @@ -3735,9 +3735,27 @@ def convert_objects(self, convert_dates=True, convert_numeric=False, @Appender(_shared_docs['fillna'] % _shared_doc_kwargs) def fillna(self, value=None, method=None, axis=None, inplace=False, - limit=None, downcast=None): + limit=None, downcast=None, errors=None): inplace = validate_bool_kwarg(inplace, 'inplace') + # if a singular fill value is provided, validate it + # special case: a DataFrame may be passed to a DataFrame + # in that case, short-circuit + if value is not None and not (isinstance(value, ABCDataFrame) and + isinstance(self, ABCDataFrame)): + # fill values by column, not all at once, to respect dtypes + if not isinstance(value, (dict, ABCSeries)) and \ + isinstance(self, ABCDataFrame): + value = {col: value for col in self.columns} + try: + missing.validate_fill_value(self, value) + except TypeError: + if errors == 'ignore': + return self + elif errors == 'raise': + raise + # if errors == 'coerce' continue + if isinstance(value, (list, tuple)): raise TypeError('"value" parameter must be a scalar or dict, but ' 'you passed a "{0}"'.format(type(value).__name__)) @@ -3756,7 +3774,8 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, if self._is_mixed_type and axis == 1: if inplace: raise NotImplementedError() - result = self.T.fillna(method=method, limit=limit).T + result = self.T.fillna(method=method, limit=limit, + errors=errors).T # need to downcast here because of all of the transposes result._data = result._data.downcast() @@ -3772,7 +3791,8 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, elif self.ndim == 3: # fill in 2d chunks - result = dict([(col, s.fillna(method=method, value=value)) + result = dict([(col, s.fillna(method=method, value=value, + errors=errors)) for col, s in self.iteritems()]) new_obj = self._constructor.\ from_dict(result).__finalize__(self) @@ -3804,7 +3824,8 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, new_data = self._data.fillna(value=value, limit=limit, inplace=inplace, - downcast=downcast) + downcast=downcast, + errors=errors) elif isinstance(value, (dict, ABCSeries)): if axis == 1: @@ -3817,12 +3838,14 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, if k not in result: continue obj = result[k] - obj.fillna(v, limit=limit, inplace=True, downcast=downcast) - return result + obj.fillna(v, limit=limit, inplace=True, + downcast=downcast, errors=errors) + return None if inplace else result elif not is_list_like(value): new_data = self._data.fillna(value=value, limit=limit, inplace=inplace, - downcast=downcast) + downcast=downcast, + errors=errors) elif isinstance(value, DataFrame) and self.ndim == 2: new_data = self.where(self.notnull(), value) else: diff --git a/pandas/core/internals.py b/pandas/core/internals.py index f2a7ac76481d4..cc423bd3694b8 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -362,10 +362,13 @@ def apply(self, func, mgr=None, **kwargs): return result def fillna(self, value, limit=None, inplace=False, downcast=None, - mgr=None): + errors=None, mgr=None): """ fillna on the block with the value. If we fail, then convert to ObjectBlock and try again """ + if not errors: + errors = 'coerce' + inplace = validate_bool_kwarg(inplace, 'inplace') if not self._can_hold_na: @@ -399,12 +402,16 @@ def fillna(self, value, limit=None, inplace=False, downcast=None, if not mask.any(): return self if inplace else self.copy() - # we cannot coerce the underlying object, so - # make an ObjectBlock - return self.to_object_block(mgr=mgr).fillna(original_value, - limit=limit, - inplace=inplace, - downcast=False) + if errors == 'coerce': + # we cannot coerce the underlying object, so + # make an ObjectBlock + return self.to_object_block(mgr=mgr).fillna(original_value, + limit=limit, + inplace=inplace, + downcast=False, + errors='ignore') + else: # errors == 'ignore' + return self def _maybe_downcast(self, blocks, downcast=None): @@ -2132,11 +2139,14 @@ def _try_coerce_result(self, result): return result def fillna(self, value, limit=None, inplace=False, downcast=None, - mgr=None): + errors=None, mgr=None): # we may need to upcast our fill to match our dtype if limit is not None: raise NotImplementedError("specifying a limit for 'fillna' has " "not been implemented yet") + if errors is not None: + raise NotImplementedError("specifying error handling for 'fillna' " + "has not been implemented yet") values = self.values if inplace else self.values.copy() values = self._try_coerce_result(values.fillna(value=value, @@ -2626,11 +2636,13 @@ def interpolate(self, method='pad', axis=0, inplace=False, limit=None, placement=self.mgr_locs) def fillna(self, value, limit=None, inplace=False, downcast=None, - mgr=None): + errors=None, mgr=None): # we may need to upcast our fill to match our dtype if limit is not None: raise NotImplementedError("specifying a limit for 'fillna' has " "not been implemented yet") + if errors is not None: + raise NotImplementedError values = self.values if inplace else self.values.copy() values = values.fillna(value, downcast=downcast) return [self.make_block_same_class(values=values, diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 5aabc9d8730dd..8d64e235a962d 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -20,7 +20,8 @@ _ensure_float64) from pandas.core.dtypes.cast import infer_dtype_from_array -from pandas.core.dtypes.missing import isnull +from pandas.core.dtypes.missing import isnull, is_valid_fill_value +from pandas.core.dtypes.generic import ABCSeries def mask_missing(arr, values_to_mask): @@ -634,6 +635,35 @@ def fill_zeros(result, x, y, name, fill): return result +def validate_fill_value(obj, value): + """ + + Fillna error coercion routine. + + Parameters + ---------- + obj : Series of DataFrame + The Series or DataFrame for which a fill value is being evaluated. + If obj is a DataFrame this method simply returns True (e.g. the fillna + operation is allowed to continue) because it will be broken up and + parsed as a sequence of sub-Series later on. + value : object + The value to be used as a fill for the object. + + Returns + ------- + continue : bool + Whether or not, based on the values and the error mode, the fill + operation ought to continue. + """ + if isinstance(obj, ABCSeries): + if not is_valid_fill_value(value, obj.dtype): + raise TypeError('"value" parameter must be compatible ' + 'with the {0} dtype, but you passed a ' + '"{1}"'.format(obj.dtype, + type(value).__name__)) + + def _interp_limit(invalid, fw_limit, bw_limit): """Get idx of values that won't be filled b/c they exceed the limits. diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index 90993890b7553..9b5fc09218b28 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -2,8 +2,9 @@ from warnings import catch_warnings import numpy as np -from datetime import datetime +from datetime import datetime, timedelta from pandas.util import testing as tm +import pytest import pandas as pd from pandas.core import config as cf @@ -14,7 +15,7 @@ from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import ( array_equivalent, isnull, notnull, - na_value_for_dtype) + na_value_for_dtype, is_valid_fill_value) def test_notnull(): @@ -312,3 +313,35 @@ def test_na_value_for_dtype(): for dtype in ['O']: assert np.isnan(na_value_for_dtype(np.dtype(dtype))) + + +@pytest.mark.parametrize(('value', 'dtype'), + [(False, bool), (np.nan, bool), + (0, int), (0.0, int), (0j, int), (np.nan, int), + (0, float), (0.0, float), (0j, float), + (np.nan, float), + (0, complex), (0.0, complex), (0j, complex), + (np.nan, complex), + (False, str), (0, str), (0.0, str), (0j, str), + (np.nan, str), ('0', str), + (datetime(1970, 1, 1), np.datetime64), + (pd.Timestamp('1970-01-01'), np.datetime64), + (timedelta(0), np.timedelta64), + (pd.Timedelta(0), np.timedelta64)]) +def test_valid_fill_value(value, dtype): + assert is_valid_fill_value(value, dtype) + + +@pytest.mark.parametrize(('value', 'dtype'), + [(0, bool), (0.0, bool), (0j, bool), ('0', bool), + ('0', int), + ('0', float), + ('0', complex), + ('0', np.dtype('datetime64')), + (timedelta(0), np.dtype('datetime64')), + (pd.Period('1970-01-01'), np.dtype('datetime64')), + ('0', np.dtype('timedelta64')), + (datetime(1970, 1, 1), np.dtype('timedelta64')), + (pd.Period('1970-01-01'), np.dtype('timedelta64'))]) +def test_invalid_fill_value(value, dtype): + assert not is_valid_fill_value(value, dtype) diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 77f0357685cab..248d6ea2f019a 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -518,6 +518,205 @@ def test_fill_value_when_combine_const(self): res = df.add(2, fill_value=0) assert_frame_equal(res, exp) + def test_fillna_error_modes_numeric_fill(self): + # Filling numeric/object cols with a numeric + df1 = DataFrame({'a': [nan, 1.0], + 'b': [nan, True], + 'c': [nan, 1], + 'd': [nan, 1j], + 'e': [nan, 'foo'],}) + expected = DataFrame({'a': [0.0, 1.0], + 'b': [0, True], + 'c': [0.0, 1.0], + 'd': [0, 1j], + 'e': [0, 'foo']}) + + result = df1.fillna(0, errors='coerce') + assert_frame_equal(result, expected) + result = df1.fillna(0, errors='ignore') + assert_frame_equal(result, expected) + result = df1.fillna(0, errors='raise') + assert_frame_equal(result, expected) + + def test_fillna_error_modes_bool_fill(self): + # Filling numeric/object cols with a bool + df1 = DataFrame({'a': [nan, 1.0], + 'b': [nan, True], + 'c': [nan, 1], + 'd': [nan, 1j], + 'e': [nan, 'foo']}) + + result = df1.fillna(False, errors='coerce') + expected = DataFrame({'a': [0.0, 1.0], + 'b': [False, True], + 'c': [0.0, 1.0], + 'd': [0.0, 1j], + 'e': [False, 'foo']}) + assert_frame_equal(result, expected) + + result = df1.fillna(False, errors='ignore') + expected = DataFrame({'a': [nan, 1.0], + 'b': [False, True], + 'c': [nan, 1.0], + 'd': [nan, 1j], + 'e': [False, 'foo']}) + assert_frame_equal(result, expected) + + with pytest.raises(TypeError): + df1.fillna(False, errors='raise') + + def test_fillna_error_modes_obj_fill(self): + # Filling numeric/object cols with an obj + df1 = DataFrame({'a': [nan, 1.0], + 'b': [nan, True], + 'c': [nan, 1], + 'd': [nan, 1j], + 'e': [nan, 'foo']}) + + result = df1.fillna('bar', errors='coerce') + expected = DataFrame({'a': ['bar', 1.0], + 'b': ['bar', True], + 'c': ['bar', 1.0], + 'd': ['bar', 1j], + 'e': ['bar', 'foo']}) + assert_frame_equal(result, expected) + + result = df1.fillna('bar', errors='ignore') + expected = DataFrame({'a': [nan, 1.0], + 'b': ['bar', True], # col cast to obj! + 'c': [nan, 1.0], + 'd': [nan, 1j], + 'e': ['bar', 'foo']}) + assert_frame_equal(result, expected) + + with pytest.raises(TypeError): + df1.fillna('bar', errors='raise') + + def test_fillna_error_modes_datetime_fill(self): + # Filling numeric/object/datetime cols with a datetime + timestamp = Timestamp('1970-01-01') + + df1 = DataFrame({'a': [nan, 1.0], + 'b': [nan, True], + 'c': [nan, 1], + 'd': [nan, 1j], + 'e': [nan, 'foo'], + 'f': [nan, timestamp]}) + + result = df1.fillna(Timestamp('1970-01-01'), errors='coerce') + expected = DataFrame({'a': [timestamp, 1.0], + 'b': [timestamp, True], + 'c': [timestamp, 1.0], + 'd': [timestamp, 1j], + 'e': [timestamp, 'foo'], + 'f': [timestamp, timestamp]}) + assert_frame_equal(result, expected) + + result = df1.fillna(timestamp, errors='ignore') + expected = DataFrame({'a': [nan, 1.0], + 'b': [timestamp, True], # col cast to obj! + 'c': [nan, 1.0], + 'd': [nan, 1j], + 'e': [timestamp, 'foo'], + 'f': [timestamp, timestamp]}) + assert_frame_equal(result, expected) + + with pytest.raises(TypeError): + df1.fillna(Timestamp('1970-01-01'), errors='raise') + + def test_fillna_error_modes_timedelta_fill(self): + # Filling numeric/object/timedelta cols with a timedelta + timedelta = pd.Timedelta('1 hour') + + df1 = DataFrame({'a': [nan, 1.0], + 'b': [nan, True], + 'c': [nan, 1], + 'd': [nan, 1j], + 'e': [nan, 'foo'], + 'f': [nan, timedelta]}) + + result = df1.fillna(timedelta, errors='coerce') + expected = DataFrame({'a': [timedelta, 1.0], + 'b': [timedelta, True], + 'c': [timedelta, 1.0], + 'd': [timedelta, 1j], + 'e': [timedelta, 'foo'], + 'f': [timedelta, timedelta]}) + assert_frame_equal(result, expected) + + result = df1.fillna(pd.Timedelta('1 hour'), errors='ignore') + expected = DataFrame({'a': [nan, 1.0], + 'b': [pd.Timedelta('1 hour'), True], # col cast to obj! + 'c': [nan, 1.0], + 'd': [nan, 1j], + 'e': [pd.Timedelta('1 hour'), 'foo'], + 'f': [pd.Timedelta('1 hour'), pd.Timedelta('1 hour')]}) + assert_frame_equal(result, expected) + + with pytest.raises(TypeError): + df1.fillna(Timestamp('1970-01-01'), errors='raise') + + def test_fillna_error_modes_period_fill(self): + # Filling numeric/object/period cols with a period + period = pd.Period('1 hour') + + df1 = DataFrame({'a': [nan, 1.0], + 'b': [nan, True], + 'c': [nan, 1], + 'd': [nan, 1j], + 'e': [nan, 'foo'], + 'f': [nan, period]}) + + result = df1.fillna(pd.Period('1 hour'), errors='coerce') + expected = DataFrame({'a': [period, 1.0], + 'b': [period, True], + 'c': [period, 1.0], + 'd': [period, 1j], + 'e': [period, 'foo'], + 'f': [period, period]}) + assert_frame_equal(result, expected) + + result = df1.fillna(period, errors='ignore') + expected = DataFrame({'a': [nan, 1.0], + 'b': [period, True], # col cast to obj! + 'c': [nan, 1.0], + 'd': [nan, 1j], + 'e': [period, 'foo'], + 'f': [period, period]}) + assert_frame_equal(result, expected) + + with pytest.raises(TypeError): + df1.fillna(period, errors='raise') + + def test_fillna_error_modes_time_dtype_interactions(self): + timedelta = pd.Timedelta('1 hour') + period = pd.Period('1 hour') + timestamp = Timestamp('1970-01-01') + + df1 = DataFrame({'a': [nan, timedelta], + 'b': [nan, period], + 'c': [nan, timestamp]}) + + result = df1.fillna(timedelta, errors='ignore') + expected = DataFrame({'a': [timedelta, timedelta], + 'b': [timedelta, period], # col cast to obj! + 'c': [nan, timestamp]}) + assert_frame_equal(result, expected) + + result = df1.fillna(period, errors='ignore') + expected = DataFrame({'a': [nan, timedelta], + 'b': [period, period], # col cast to obj! + 'c': [nan, timestamp]}) + assert_frame_equal(result, expected) + + result = df1.fillna(timestamp, errors='ignore') + expected = DataFrame({'a': [nan, timedelta], + 'b': [timestamp, period], # col cast to obj! + 'c': [timestamp, timestamp]}) + assert_frame_equal(result, expected) + + # TODO: coerce tests. + class TestDataFrameInterpolate(TestData):